{
  "best_metric": 0.7337000966072083,
  "best_model_checkpoint": "/home/ubuntu/utah/babylm-24/src/evaluation/results/finetune/DebertaV2-Base-10M_babylm-A/mnli/checkpoint-147264",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 147264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010185788787483703,
      "grad_norm": 3.972818613052368,
      "learning_rate": 2.9938885267275102e-05,
      "loss": 1.1058,
      "step": 500
    },
    {
      "epoch": 0.020371577574967405,
      "grad_norm": 3.13022780418396,
      "learning_rate": 2.9877770534550196e-05,
      "loss": 1.0388,
      "step": 1000
    },
    {
      "epoch": 0.03055736636245111,
      "grad_norm": 5.194879055023193,
      "learning_rate": 2.9816655801825297e-05,
      "loss": 0.999,
      "step": 1500
    },
    {
      "epoch": 0.04074315514993481,
      "grad_norm": 6.858974933624268,
      "learning_rate": 2.975554106910039e-05,
      "loss": 0.9637,
      "step": 2000
    },
    {
      "epoch": 0.050928943937418515,
      "grad_norm": 7.074331760406494,
      "learning_rate": 2.9694426336375492e-05,
      "loss": 0.9697,
      "step": 2500
    },
    {
      "epoch": 0.06111473272490222,
      "grad_norm": 5.386518478393555,
      "learning_rate": 2.9633311603650586e-05,
      "loss": 0.9472,
      "step": 3000
    },
    {
      "epoch": 0.07130052151238592,
      "grad_norm": 5.582085609436035,
      "learning_rate": 2.9572196870925687e-05,
      "loss": 0.9273,
      "step": 3500
    },
    {
      "epoch": 0.08148631029986962,
      "grad_norm": 10.966410636901855,
      "learning_rate": 2.951108213820078e-05,
      "loss": 0.9316,
      "step": 4000
    },
    {
      "epoch": 0.09167209908735333,
      "grad_norm": 7.840672492980957,
      "learning_rate": 2.9449967405475882e-05,
      "loss": 0.9237,
      "step": 4500
    },
    {
      "epoch": 0.10185788787483703,
      "grad_norm": 7.442521095275879,
      "learning_rate": 2.9388852672750976e-05,
      "loss": 0.918,
      "step": 5000
    },
    {
      "epoch": 0.11204367666232073,
      "grad_norm": 8.379261016845703,
      "learning_rate": 2.9327737940026077e-05,
      "loss": 0.8798,
      "step": 5500
    },
    {
      "epoch": 0.12222946544980444,
      "grad_norm": 6.880769729614258,
      "learning_rate": 2.926662320730117e-05,
      "loss": 0.9045,
      "step": 6000
    },
    {
      "epoch": 0.13241525423728814,
      "grad_norm": 5.965817451477051,
      "learning_rate": 2.9205508474576272e-05,
      "loss": 0.8941,
      "step": 6500
    },
    {
      "epoch": 0.14260104302477183,
      "grad_norm": 9.67165756225586,
      "learning_rate": 2.914439374185137e-05,
      "loss": 0.878,
      "step": 7000
    },
    {
      "epoch": 0.15278683181225555,
      "grad_norm": 6.4934892654418945,
      "learning_rate": 2.9083279009126467e-05,
      "loss": 0.8723,
      "step": 7500
    },
    {
      "epoch": 0.16297262059973924,
      "grad_norm": 11.987072944641113,
      "learning_rate": 2.9022164276401565e-05,
      "loss": 0.8913,
      "step": 8000
    },
    {
      "epoch": 0.17315840938722293,
      "grad_norm": 9.218280792236328,
      "learning_rate": 2.8961049543676662e-05,
      "loss": 0.868,
      "step": 8500
    },
    {
      "epoch": 0.18334419817470665,
      "grad_norm": 6.697582244873047,
      "learning_rate": 2.8899934810951763e-05,
      "loss": 0.8618,
      "step": 9000
    },
    {
      "epoch": 0.19352998696219034,
      "grad_norm": 8.035455703735352,
      "learning_rate": 2.8838820078226858e-05,
      "loss": 0.8641,
      "step": 9500
    },
    {
      "epoch": 0.20371577574967406,
      "grad_norm": 6.890414237976074,
      "learning_rate": 2.877770534550196e-05,
      "loss": 0.8574,
      "step": 10000
    },
    {
      "epoch": 0.21390156453715775,
      "grad_norm": 9.001678466796875,
      "learning_rate": 2.8716590612777053e-05,
      "loss": 0.8547,
      "step": 10500
    },
    {
      "epoch": 0.22408735332464147,
      "grad_norm": 7.271191120147705,
      "learning_rate": 2.8655475880052154e-05,
      "loss": 0.8332,
      "step": 11000
    },
    {
      "epoch": 0.23427314211212516,
      "grad_norm": 9.849822998046875,
      "learning_rate": 2.8594361147327248e-05,
      "loss": 0.8447,
      "step": 11500
    },
    {
      "epoch": 0.24445893089960888,
      "grad_norm": 15.07829761505127,
      "learning_rate": 2.853324641460235e-05,
      "loss": 0.8251,
      "step": 12000
    },
    {
      "epoch": 0.25464471968709257,
      "grad_norm": 6.886602878570557,
      "learning_rate": 2.8472131681877446e-05,
      "loss": 0.8444,
      "step": 12500
    },
    {
      "epoch": 0.2648305084745763,
      "grad_norm": 9.94825267791748,
      "learning_rate": 2.8411016949152544e-05,
      "loss": 0.8504,
      "step": 13000
    },
    {
      "epoch": 0.27501629726205995,
      "grad_norm": 8.0740966796875,
      "learning_rate": 2.834990221642764e-05,
      "loss": 0.8182,
      "step": 13500
    },
    {
      "epoch": 0.28520208604954367,
      "grad_norm": 10.770129203796387,
      "learning_rate": 2.828878748370274e-05,
      "loss": 0.8373,
      "step": 14000
    },
    {
      "epoch": 0.2953878748370274,
      "grad_norm": 9.214078903198242,
      "learning_rate": 2.8227672750977836e-05,
      "loss": 0.8089,
      "step": 14500
    },
    {
      "epoch": 0.3055736636245111,
      "grad_norm": 14.231884956359863,
      "learning_rate": 2.8166558018252934e-05,
      "loss": 0.8337,
      "step": 15000
    },
    {
      "epoch": 0.31575945241199477,
      "grad_norm": 9.379185676574707,
      "learning_rate": 2.810544328552803e-05,
      "loss": 0.8315,
      "step": 15500
    },
    {
      "epoch": 0.3259452411994785,
      "grad_norm": 8.370843887329102,
      "learning_rate": 2.804432855280313e-05,
      "loss": 0.808,
      "step": 16000
    },
    {
      "epoch": 0.3361310299869622,
      "grad_norm": 7.201444149017334,
      "learning_rate": 2.7983213820078227e-05,
      "loss": 0.808,
      "step": 16500
    },
    {
      "epoch": 0.34631681877444587,
      "grad_norm": 10.696725845336914,
      "learning_rate": 2.7922099087353324e-05,
      "loss": 0.813,
      "step": 17000
    },
    {
      "epoch": 0.3565026075619296,
      "grad_norm": 6.316204071044922,
      "learning_rate": 2.7860984354628422e-05,
      "loss": 0.8213,
      "step": 17500
    },
    {
      "epoch": 0.3666883963494133,
      "grad_norm": 14.780086517333984,
      "learning_rate": 2.779986962190352e-05,
      "loss": 0.8165,
      "step": 18000
    },
    {
      "epoch": 0.376874185136897,
      "grad_norm": 11.34945011138916,
      "learning_rate": 2.773875488917862e-05,
      "loss": 0.7943,
      "step": 18500
    },
    {
      "epoch": 0.3870599739243807,
      "grad_norm": 7.136452674865723,
      "learning_rate": 2.7677640156453718e-05,
      "loss": 0.8017,
      "step": 19000
    },
    {
      "epoch": 0.3972457627118644,
      "grad_norm": 6.209742069244385,
      "learning_rate": 2.7616525423728815e-05,
      "loss": 0.7941,
      "step": 19500
    },
    {
      "epoch": 0.4074315514993481,
      "grad_norm": 8.923270225524902,
      "learning_rate": 2.7555410691003913e-05,
      "loss": 0.7826,
      "step": 20000
    },
    {
      "epoch": 0.41761734028683184,
      "grad_norm": 6.371203899383545,
      "learning_rate": 2.749429595827901e-05,
      "loss": 0.7938,
      "step": 20500
    },
    {
      "epoch": 0.4278031290743155,
      "grad_norm": 8.679354667663574,
      "learning_rate": 2.7433181225554108e-05,
      "loss": 0.7936,
      "step": 21000
    },
    {
      "epoch": 0.4379889178617992,
      "grad_norm": 5.962934970855713,
      "learning_rate": 2.7372066492829205e-05,
      "loss": 0.7924,
      "step": 21500
    },
    {
      "epoch": 0.44817470664928294,
      "grad_norm": 11.835200309753418,
      "learning_rate": 2.7310951760104303e-05,
      "loss": 0.7894,
      "step": 22000
    },
    {
      "epoch": 0.4583604954367666,
      "grad_norm": 9.62806510925293,
      "learning_rate": 2.72498370273794e-05,
      "loss": 0.7963,
      "step": 22500
    },
    {
      "epoch": 0.4685462842242503,
      "grad_norm": 7.6390509605407715,
      "learning_rate": 2.7188722294654498e-05,
      "loss": 0.783,
      "step": 23000
    },
    {
      "epoch": 0.47873207301173404,
      "grad_norm": 5.826275825500488,
      "learning_rate": 2.7127607561929596e-05,
      "loss": 0.7859,
      "step": 23500
    },
    {
      "epoch": 0.48891786179921776,
      "grad_norm": 5.199416637420654,
      "learning_rate": 2.7066492829204693e-05,
      "loss": 0.7989,
      "step": 24000
    },
    {
      "epoch": 0.4991036505867014,
      "grad_norm": 12.38355541229248,
      "learning_rate": 2.7005378096479794e-05,
      "loss": 0.7864,
      "step": 24500
    },
    {
      "epoch": 0.5092894393741851,
      "grad_norm": 7.700937271118164,
      "learning_rate": 2.694426336375489e-05,
      "loss": 0.7747,
      "step": 25000
    },
    {
      "epoch": 0.5194752281616688,
      "grad_norm": 10.8720703125,
      "learning_rate": 2.688314863102999e-05,
      "loss": 0.7974,
      "step": 25500
    },
    {
      "epoch": 0.5296610169491526,
      "grad_norm": 6.260867595672607,
      "learning_rate": 2.6822033898305083e-05,
      "loss": 0.7914,
      "step": 26000
    },
    {
      "epoch": 0.5398468057366362,
      "grad_norm": 11.113471984863281,
      "learning_rate": 2.6760919165580184e-05,
      "loss": 0.782,
      "step": 26500
    },
    {
      "epoch": 0.5500325945241199,
      "grad_norm": 7.497796058654785,
      "learning_rate": 2.6699804432855282e-05,
      "loss": 0.7642,
      "step": 27000
    },
    {
      "epoch": 0.5602183833116037,
      "grad_norm": 6.1969170570373535,
      "learning_rate": 2.663868970013038e-05,
      "loss": 0.7693,
      "step": 27500
    },
    {
      "epoch": 0.5704041720990873,
      "grad_norm": 9.965928077697754,
      "learning_rate": 2.6577574967405477e-05,
      "loss": 0.7681,
      "step": 28000
    },
    {
      "epoch": 0.5805899608865711,
      "grad_norm": 5.052292346954346,
      "learning_rate": 2.6516460234680575e-05,
      "loss": 0.7664,
      "step": 28500
    },
    {
      "epoch": 0.5907757496740548,
      "grad_norm": 6.7289276123046875,
      "learning_rate": 2.6455345501955672e-05,
      "loss": 0.7445,
      "step": 29000
    },
    {
      "epoch": 0.6009615384615384,
      "grad_norm": 8.478078842163086,
      "learning_rate": 2.639423076923077e-05,
      "loss": 0.7859,
      "step": 29500
    },
    {
      "epoch": 0.6111473272490222,
      "grad_norm": 6.2336106300354,
      "learning_rate": 2.6333116036505867e-05,
      "loss": 0.7598,
      "step": 30000
    },
    {
      "epoch": 0.6213331160365059,
      "grad_norm": 11.82295036315918,
      "learning_rate": 2.6272001303780965e-05,
      "loss": 0.7507,
      "step": 30500
    },
    {
      "epoch": 0.6315189048239895,
      "grad_norm": 8.545003890991211,
      "learning_rate": 2.6210886571056066e-05,
      "loss": 0.7769,
      "step": 31000
    },
    {
      "epoch": 0.6417046936114733,
      "grad_norm": 9.849065780639648,
      "learning_rate": 2.614977183833116e-05,
      "loss": 0.7781,
      "step": 31500
    },
    {
      "epoch": 0.651890482398957,
      "grad_norm": 7.641623020172119,
      "learning_rate": 2.608865710560626e-05,
      "loss": 0.7789,
      "step": 32000
    },
    {
      "epoch": 0.6620762711864406,
      "grad_norm": 16.397785186767578,
      "learning_rate": 2.6027542372881355e-05,
      "loss": 0.7542,
      "step": 32500
    },
    {
      "epoch": 0.6722620599739244,
      "grad_norm": 11.119671821594238,
      "learning_rate": 2.5966427640156456e-05,
      "loss": 0.7716,
      "step": 33000
    },
    {
      "epoch": 0.6824478487614081,
      "grad_norm": 12.466675758361816,
      "learning_rate": 2.590531290743155e-05,
      "loss": 0.7743,
      "step": 33500
    },
    {
      "epoch": 0.6926336375488917,
      "grad_norm": 8.151625633239746,
      "learning_rate": 2.584419817470665e-05,
      "loss": 0.7581,
      "step": 34000
    },
    {
      "epoch": 0.7028194263363755,
      "grad_norm": 7.743143558502197,
      "learning_rate": 2.5783083441981745e-05,
      "loss": 0.7599,
      "step": 34500
    },
    {
      "epoch": 0.7130052151238592,
      "grad_norm": 5.249680995941162,
      "learning_rate": 2.5721968709256846e-05,
      "loss": 0.7483,
      "step": 35000
    },
    {
      "epoch": 0.7231910039113429,
      "grad_norm": 6.738178730010986,
      "learning_rate": 2.566085397653194e-05,
      "loss": 0.7781,
      "step": 35500
    },
    {
      "epoch": 0.7333767926988266,
      "grad_norm": 6.7923102378845215,
      "learning_rate": 2.559973924380704e-05,
      "loss": 0.7464,
      "step": 36000
    },
    {
      "epoch": 0.7435625814863103,
      "grad_norm": 15.575695991516113,
      "learning_rate": 2.553862451108214e-05,
      "loss": 0.7571,
      "step": 36500
    },
    {
      "epoch": 0.753748370273794,
      "grad_norm": 13.128719329833984,
      "learning_rate": 2.5477509778357236e-05,
      "loss": 0.7625,
      "step": 37000
    },
    {
      "epoch": 0.7639341590612777,
      "grad_norm": 4.938621997833252,
      "learning_rate": 2.5416395045632337e-05,
      "loss": 0.7477,
      "step": 37500
    },
    {
      "epoch": 0.7741199478487614,
      "grad_norm": 8.46985912322998,
      "learning_rate": 2.535528031290743e-05,
      "loss": 0.7419,
      "step": 38000
    },
    {
      "epoch": 0.7843057366362451,
      "grad_norm": 18.881208419799805,
      "learning_rate": 2.5294165580182532e-05,
      "loss": 0.7403,
      "step": 38500
    },
    {
      "epoch": 0.7944915254237288,
      "grad_norm": 9.986156463623047,
      "learning_rate": 2.5233050847457626e-05,
      "loss": 0.7508,
      "step": 39000
    },
    {
      "epoch": 0.8046773142112125,
      "grad_norm": 11.625401496887207,
      "learning_rate": 2.5171936114732727e-05,
      "loss": 0.7426,
      "step": 39500
    },
    {
      "epoch": 0.8148631029986962,
      "grad_norm": 9.926522254943848,
      "learning_rate": 2.511082138200782e-05,
      "loss": 0.7365,
      "step": 40000
    },
    {
      "epoch": 0.8250488917861799,
      "grad_norm": 8.104277610778809,
      "learning_rate": 2.5049706649282922e-05,
      "loss": 0.7532,
      "step": 40500
    },
    {
      "epoch": 0.8352346805736637,
      "grad_norm": 5.898177623748779,
      "learning_rate": 2.4988591916558017e-05,
      "loss": 0.7657,
      "step": 41000
    },
    {
      "epoch": 0.8454204693611473,
      "grad_norm": 6.856070041656494,
      "learning_rate": 2.4927477183833118e-05,
      "loss": 0.7438,
      "step": 41500
    },
    {
      "epoch": 0.855606258148631,
      "grad_norm": 11.806628227233887,
      "learning_rate": 2.4866362451108212e-05,
      "loss": 0.7348,
      "step": 42000
    },
    {
      "epoch": 0.8657920469361148,
      "grad_norm": 12.08478832244873,
      "learning_rate": 2.4805247718383313e-05,
      "loss": 0.7277,
      "step": 42500
    },
    {
      "epoch": 0.8759778357235984,
      "grad_norm": 5.707648277282715,
      "learning_rate": 2.474413298565841e-05,
      "loss": 0.7458,
      "step": 43000
    },
    {
      "epoch": 0.8861636245110821,
      "grad_norm": 5.36006498336792,
      "learning_rate": 2.4683018252933508e-05,
      "loss": 0.7446,
      "step": 43500
    },
    {
      "epoch": 0.8963494132985659,
      "grad_norm": 6.603504180908203,
      "learning_rate": 2.4621903520208605e-05,
      "loss": 0.7278,
      "step": 44000
    },
    {
      "epoch": 0.9065352020860495,
      "grad_norm": 7.789953231811523,
      "learning_rate": 2.4560788787483703e-05,
      "loss": 0.7375,
      "step": 44500
    },
    {
      "epoch": 0.9167209908735332,
      "grad_norm": 11.424116134643555,
      "learning_rate": 2.4499674054758804e-05,
      "loss": 0.7297,
      "step": 45000
    },
    {
      "epoch": 0.926906779661017,
      "grad_norm": 6.7079620361328125,
      "learning_rate": 2.4438559322033898e-05,
      "loss": 0.7411,
      "step": 45500
    },
    {
      "epoch": 0.9370925684485006,
      "grad_norm": 11.931846618652344,
      "learning_rate": 2.4377444589309e-05,
      "loss": 0.749,
      "step": 46000
    },
    {
      "epoch": 0.9472783572359843,
      "grad_norm": 8.87076473236084,
      "learning_rate": 2.4316329856584093e-05,
      "loss": 0.7312,
      "step": 46500
    },
    {
      "epoch": 0.9574641460234681,
      "grad_norm": 12.787999153137207,
      "learning_rate": 2.4255215123859194e-05,
      "loss": 0.7244,
      "step": 47000
    },
    {
      "epoch": 0.9676499348109517,
      "grad_norm": 6.917387962341309,
      "learning_rate": 2.4194100391134288e-05,
      "loss": 0.733,
      "step": 47500
    },
    {
      "epoch": 0.9778357235984355,
      "grad_norm": 7.564793109893799,
      "learning_rate": 2.413298565840939e-05,
      "loss": 0.7253,
      "step": 48000
    },
    {
      "epoch": 0.9880215123859192,
      "grad_norm": 7.665174961090088,
      "learning_rate": 2.4071870925684483e-05,
      "loss": 0.7191,
      "step": 48500
    },
    {
      "epoch": 0.9982073011734028,
      "grad_norm": 9.235818862915039,
      "learning_rate": 2.4010756192959584e-05,
      "loss": 0.7288,
      "step": 49000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.6841890811920166,
      "eval_loss": 0.7356013059616089,
      "eval_runtime": 8.0037,
      "eval_samples_per_second": 613.213,
      "eval_steps_per_second": 76.714,
      "step": 49088
    },
    {
      "epoch": 1.0083930899608866,
      "grad_norm": 9.232441902160645,
      "learning_rate": 2.3949641460234682e-05,
      "loss": 0.666,
      "step": 49500
    },
    {
      "epoch": 1.0185788787483703,
      "grad_norm": 5.098124980926514,
      "learning_rate": 2.388852672750978e-05,
      "loss": 0.6762,
      "step": 50000
    },
    {
      "epoch": 1.028764667535854,
      "grad_norm": 6.507536888122559,
      "learning_rate": 2.3827411994784877e-05,
      "loss": 0.6509,
      "step": 50500
    },
    {
      "epoch": 1.0389504563233376,
      "grad_norm": 12.516643524169922,
      "learning_rate": 2.3766297262059974e-05,
      "loss": 0.6569,
      "step": 51000
    },
    {
      "epoch": 1.0491362451108215,
      "grad_norm": 11.644853591918945,
      "learning_rate": 2.3705182529335072e-05,
      "loss": 0.6498,
      "step": 51500
    },
    {
      "epoch": 1.0593220338983051,
      "grad_norm": 15.303723335266113,
      "learning_rate": 2.364406779661017e-05,
      "loss": 0.6565,
      "step": 52000
    },
    {
      "epoch": 1.0695078226857888,
      "grad_norm": 11.212811470031738,
      "learning_rate": 2.3582953063885267e-05,
      "loss": 0.6492,
      "step": 52500
    },
    {
      "epoch": 1.0796936114732725,
      "grad_norm": 7.695071220397949,
      "learning_rate": 2.3521838331160365e-05,
      "loss": 0.6618,
      "step": 53000
    },
    {
      "epoch": 1.0898794002607561,
      "grad_norm": 8.234328269958496,
      "learning_rate": 2.3460723598435462e-05,
      "loss": 0.6709,
      "step": 53500
    },
    {
      "epoch": 1.1000651890482398,
      "grad_norm": 12.574545860290527,
      "learning_rate": 2.339960886571056e-05,
      "loss": 0.6518,
      "step": 54000
    },
    {
      "epoch": 1.1102509778357237,
      "grad_norm": 5.914109706878662,
      "learning_rate": 2.333849413298566e-05,
      "loss": 0.6494,
      "step": 54500
    },
    {
      "epoch": 1.1204367666232073,
      "grad_norm": 11.49263858795166,
      "learning_rate": 2.3277379400260758e-05,
      "loss": 0.6723,
      "step": 55000
    },
    {
      "epoch": 1.130622555410691,
      "grad_norm": 11.228019714355469,
      "learning_rate": 2.3216264667535856e-05,
      "loss": 0.6328,
      "step": 55500
    },
    {
      "epoch": 1.1408083441981747,
      "grad_norm": 14.478355407714844,
      "learning_rate": 2.3155149934810953e-05,
      "loss": 0.667,
      "step": 56000
    },
    {
      "epoch": 1.1509941329856583,
      "grad_norm": 18.817468643188477,
      "learning_rate": 2.309403520208605e-05,
      "loss": 0.6674,
      "step": 56500
    },
    {
      "epoch": 1.161179921773142,
      "grad_norm": 12.483678817749023,
      "learning_rate": 2.303292046936115e-05,
      "loss": 0.6677,
      "step": 57000
    },
    {
      "epoch": 1.1713657105606259,
      "grad_norm": 7.133495807647705,
      "learning_rate": 2.2971805736636246e-05,
      "loss": 0.642,
      "step": 57500
    },
    {
      "epoch": 1.1815514993481095,
      "grad_norm": 7.099282741546631,
      "learning_rate": 2.2910691003911343e-05,
      "loss": 0.664,
      "step": 58000
    },
    {
      "epoch": 1.1917372881355932,
      "grad_norm": 9.188867568969727,
      "learning_rate": 2.284957627118644e-05,
      "loss": 0.6703,
      "step": 58500
    },
    {
      "epoch": 1.2019230769230769,
      "grad_norm": 5.470687389373779,
      "learning_rate": 2.278846153846154e-05,
      "loss": 0.6575,
      "step": 59000
    },
    {
      "epoch": 1.2121088657105605,
      "grad_norm": 8.013532638549805,
      "learning_rate": 2.2727346805736636e-05,
      "loss": 0.6465,
      "step": 59500
    },
    {
      "epoch": 1.2222946544980444,
      "grad_norm": 8.846539497375488,
      "learning_rate": 2.2666232073011734e-05,
      "loss": 0.6682,
      "step": 60000
    },
    {
      "epoch": 1.232480443285528,
      "grad_norm": 11.2225341796875,
      "learning_rate": 2.260511734028683e-05,
      "loss": 0.653,
      "step": 60500
    },
    {
      "epoch": 1.2426662320730117,
      "grad_norm": 12.363032341003418,
      "learning_rate": 2.254400260756193e-05,
      "loss": 0.6807,
      "step": 61000
    },
    {
      "epoch": 1.2528520208604954,
      "grad_norm": 8.01014232635498,
      "learning_rate": 2.248288787483703e-05,
      "loss": 0.6625,
      "step": 61500
    },
    {
      "epoch": 1.263037809647979,
      "grad_norm": 20.960845947265625,
      "learning_rate": 2.2421773142112124e-05,
      "loss": 0.6524,
      "step": 62000
    },
    {
      "epoch": 1.2732235984354627,
      "grad_norm": 13.620718002319336,
      "learning_rate": 2.2360658409387225e-05,
      "loss": 0.6625,
      "step": 62500
    },
    {
      "epoch": 1.2834093872229466,
      "grad_norm": 18.55965232849121,
      "learning_rate": 2.2299543676662322e-05,
      "loss": 0.6434,
      "step": 63000
    },
    {
      "epoch": 1.2935951760104303,
      "grad_norm": 12.919173240661621,
      "learning_rate": 2.223842894393742e-05,
      "loss": 0.652,
      "step": 63500
    },
    {
      "epoch": 1.303780964797914,
      "grad_norm": 19.561386108398438,
      "learning_rate": 2.2177314211212517e-05,
      "loss": 0.6498,
      "step": 64000
    },
    {
      "epoch": 1.3139667535853976,
      "grad_norm": 10.529995918273926,
      "learning_rate": 2.2116199478487615e-05,
      "loss": 0.666,
      "step": 64500
    },
    {
      "epoch": 1.3241525423728815,
      "grad_norm": 13.268050193786621,
      "learning_rate": 2.2055084745762713e-05,
      "loss": 0.6499,
      "step": 65000
    },
    {
      "epoch": 1.3343383311603652,
      "grad_norm": 6.3072285652160645,
      "learning_rate": 2.199397001303781e-05,
      "loss": 0.6798,
      "step": 65500
    },
    {
      "epoch": 1.3445241199478488,
      "grad_norm": 5.304599761962891,
      "learning_rate": 2.1932855280312908e-05,
      "loss": 0.6559,
      "step": 66000
    },
    {
      "epoch": 1.3547099087353325,
      "grad_norm": 13.698792457580566,
      "learning_rate": 2.1871740547588005e-05,
      "loss": 0.6441,
      "step": 66500
    },
    {
      "epoch": 1.3648956975228161,
      "grad_norm": 8.951632499694824,
      "learning_rate": 2.1810625814863103e-05,
      "loss": 0.6564,
      "step": 67000
    },
    {
      "epoch": 1.3750814863102998,
      "grad_norm": 5.84067440032959,
      "learning_rate": 2.17495110821382e-05,
      "loss": 0.6686,
      "step": 67500
    },
    {
      "epoch": 1.3852672750977835,
      "grad_norm": 6.442080497741699,
      "learning_rate": 2.16883963494133e-05,
      "loss": 0.6517,
      "step": 68000
    },
    {
      "epoch": 1.3954530638852674,
      "grad_norm": 5.514212608337402,
      "learning_rate": 2.1627281616688395e-05,
      "loss": 0.6664,
      "step": 68500
    },
    {
      "epoch": 1.405638852672751,
      "grad_norm": 14.150158882141113,
      "learning_rate": 2.1566166883963496e-05,
      "loss": 0.6561,
      "step": 69000
    },
    {
      "epoch": 1.4158246414602347,
      "grad_norm": 17.196884155273438,
      "learning_rate": 2.150505215123859e-05,
      "loss": 0.6513,
      "step": 69500
    },
    {
      "epoch": 1.4260104302477183,
      "grad_norm": 6.181870937347412,
      "learning_rate": 2.144393741851369e-05,
      "loss": 0.65,
      "step": 70000
    },
    {
      "epoch": 1.436196219035202,
      "grad_norm": 14.865707397460938,
      "learning_rate": 2.1382822685788786e-05,
      "loss": 0.6596,
      "step": 70500
    },
    {
      "epoch": 1.4463820078226859,
      "grad_norm": 15.808574676513672,
      "learning_rate": 2.1321707953063886e-05,
      "loss": 0.6559,
      "step": 71000
    },
    {
      "epoch": 1.4565677966101696,
      "grad_norm": 16.76003074645996,
      "learning_rate": 2.1260593220338984e-05,
      "loss": 0.6467,
      "step": 71500
    },
    {
      "epoch": 1.4667535853976532,
      "grad_norm": 21.466825485229492,
      "learning_rate": 2.119947848761408e-05,
      "loss": 0.6579,
      "step": 72000
    },
    {
      "epoch": 1.4769393741851369,
      "grad_norm": 18.81052589416504,
      "learning_rate": 2.113836375488918e-05,
      "loss": 0.6699,
      "step": 72500
    },
    {
      "epoch": 1.4871251629726205,
      "grad_norm": 10.334957122802734,
      "learning_rate": 2.1077249022164277e-05,
      "loss": 0.655,
      "step": 73000
    },
    {
      "epoch": 1.4973109517601042,
      "grad_norm": 14.836852073669434,
      "learning_rate": 2.1016134289439378e-05,
      "loss": 0.66,
      "step": 73500
    },
    {
      "epoch": 1.5074967405475879,
      "grad_norm": 21.439233779907227,
      "learning_rate": 2.0955019556714472e-05,
      "loss": 0.6486,
      "step": 74000
    },
    {
      "epoch": 1.5176825293350718,
      "grad_norm": 21.09993553161621,
      "learning_rate": 2.0893904823989573e-05,
      "loss": 0.6432,
      "step": 74500
    },
    {
      "epoch": 1.5278683181225554,
      "grad_norm": 8.926955223083496,
      "learning_rate": 2.0832790091264667e-05,
      "loss": 0.6418,
      "step": 75000
    },
    {
      "epoch": 1.538054106910039,
      "grad_norm": 3.551163911819458,
      "learning_rate": 2.0771675358539768e-05,
      "loss": 0.6418,
      "step": 75500
    },
    {
      "epoch": 1.548239895697523,
      "grad_norm": 10.46932315826416,
      "learning_rate": 2.0710560625814862e-05,
      "loss": 0.657,
      "step": 76000
    },
    {
      "epoch": 1.5584256844850066,
      "grad_norm": 10.022995948791504,
      "learning_rate": 2.0649445893089963e-05,
      "loss": 0.6564,
      "step": 76500
    },
    {
      "epoch": 1.5686114732724903,
      "grad_norm": 15.299884796142578,
      "learning_rate": 2.0588331160365057e-05,
      "loss": 0.6535,
      "step": 77000
    },
    {
      "epoch": 1.578797262059974,
      "grad_norm": 11.149444580078125,
      "learning_rate": 2.0527216427640158e-05,
      "loss": 0.6409,
      "step": 77500
    },
    {
      "epoch": 1.5889830508474576,
      "grad_norm": 4.011179447174072,
      "learning_rate": 2.0466101694915252e-05,
      "loss": 0.6299,
      "step": 78000
    },
    {
      "epoch": 1.5991688396349413,
      "grad_norm": 13.744772911071777,
      "learning_rate": 2.0404986962190353e-05,
      "loss": 0.6511,
      "step": 78500
    },
    {
      "epoch": 1.609354628422425,
      "grad_norm": 11.129698753356934,
      "learning_rate": 2.0343872229465447e-05,
      "loss": 0.6349,
      "step": 79000
    },
    {
      "epoch": 1.6195404172099086,
      "grad_norm": 16.331953048706055,
      "learning_rate": 2.0282757496740548e-05,
      "loss": 0.6517,
      "step": 79500
    },
    {
      "epoch": 1.6297262059973925,
      "grad_norm": 12.816121101379395,
      "learning_rate": 2.0221642764015646e-05,
      "loss": 0.6409,
      "step": 80000
    },
    {
      "epoch": 1.6399119947848761,
      "grad_norm": 13.881726264953613,
      "learning_rate": 2.0160528031290743e-05,
      "loss": 0.6606,
      "step": 80500
    },
    {
      "epoch": 1.6500977835723598,
      "grad_norm": 14.215683937072754,
      "learning_rate": 2.0099413298565844e-05,
      "loss": 0.657,
      "step": 81000
    },
    {
      "epoch": 1.6602835723598437,
      "grad_norm": 19.557342529296875,
      "learning_rate": 2.003829856584094e-05,
      "loss": 0.6451,
      "step": 81500
    },
    {
      "epoch": 1.6704693611473274,
      "grad_norm": 17.83816909790039,
      "learning_rate": 1.997718383311604e-05,
      "loss": 0.6397,
      "step": 82000
    },
    {
      "epoch": 1.680655149934811,
      "grad_norm": 8.789198875427246,
      "learning_rate": 1.9916069100391133e-05,
      "loss": 0.6514,
      "step": 82500
    },
    {
      "epoch": 1.6908409387222947,
      "grad_norm": 11.361543655395508,
      "learning_rate": 1.9854954367666234e-05,
      "loss": 0.6466,
      "step": 83000
    },
    {
      "epoch": 1.7010267275097783,
      "grad_norm": 15.630517959594727,
      "learning_rate": 1.979383963494133e-05,
      "loss": 0.6316,
      "step": 83500
    },
    {
      "epoch": 1.711212516297262,
      "grad_norm": 10.55432415008545,
      "learning_rate": 1.973272490221643e-05,
      "loss": 0.6533,
      "step": 84000
    },
    {
      "epoch": 1.7213983050847457,
      "grad_norm": 8.428609848022461,
      "learning_rate": 1.9671610169491524e-05,
      "loss": 0.6478,
      "step": 84500
    },
    {
      "epoch": 1.7315840938722293,
      "grad_norm": 4.304576873779297,
      "learning_rate": 1.9610495436766625e-05,
      "loss": 0.6449,
      "step": 85000
    },
    {
      "epoch": 1.7417698826597132,
      "grad_norm": 21.92937469482422,
      "learning_rate": 1.9549380704041722e-05,
      "loss": 0.6425,
      "step": 85500
    },
    {
      "epoch": 1.7519556714471969,
      "grad_norm": 19.813392639160156,
      "learning_rate": 1.948826597131682e-05,
      "loss": 0.6238,
      "step": 86000
    },
    {
      "epoch": 1.7621414602346805,
      "grad_norm": 6.831646919250488,
      "learning_rate": 1.9427151238591917e-05,
      "loss": 0.6553,
      "step": 86500
    },
    {
      "epoch": 1.7723272490221644,
      "grad_norm": 11.058158874511719,
      "learning_rate": 1.9366036505867015e-05,
      "loss": 0.6528,
      "step": 87000
    },
    {
      "epoch": 1.782513037809648,
      "grad_norm": 7.01440954208374,
      "learning_rate": 1.9304921773142112e-05,
      "loss": 0.6506,
      "step": 87500
    },
    {
      "epoch": 1.7926988265971318,
      "grad_norm": 4.963765621185303,
      "learning_rate": 1.924380704041721e-05,
      "loss": 0.6412,
      "step": 88000
    },
    {
      "epoch": 1.8028846153846154,
      "grad_norm": 38.63767623901367,
      "learning_rate": 1.9182692307692307e-05,
      "loss": 0.6629,
      "step": 88500
    },
    {
      "epoch": 1.813070404172099,
      "grad_norm": 11.423843383789062,
      "learning_rate": 1.9121577574967405e-05,
      "loss": 0.673,
      "step": 89000
    },
    {
      "epoch": 1.8232561929595827,
      "grad_norm": 11.73025894165039,
      "learning_rate": 1.9060462842242506e-05,
      "loss": 0.625,
      "step": 89500
    },
    {
      "epoch": 1.8334419817470664,
      "grad_norm": 11.493837356567383,
      "learning_rate": 1.89993481095176e-05,
      "loss": 0.6658,
      "step": 90000
    },
    {
      "epoch": 1.84362777053455,
      "grad_norm": 12.196702003479004,
      "learning_rate": 1.89382333767927e-05,
      "loss": 0.6451,
      "step": 90500
    },
    {
      "epoch": 1.8538135593220337,
      "grad_norm": 9.690689086914062,
      "learning_rate": 1.8877118644067795e-05,
      "loss": 0.6282,
      "step": 91000
    },
    {
      "epoch": 1.8639993481095176,
      "grad_norm": 13.679101943969727,
      "learning_rate": 1.8816003911342896e-05,
      "loss": 0.6328,
      "step": 91500
    },
    {
      "epoch": 1.8741851368970013,
      "grad_norm": 14.264899253845215,
      "learning_rate": 1.8754889178617994e-05,
      "loss": 0.6355,
      "step": 92000
    },
    {
      "epoch": 1.8843709256844852,
      "grad_norm": 9.039682388305664,
      "learning_rate": 1.869377444589309e-05,
      "loss": 0.6471,
      "step": 92500
    },
    {
      "epoch": 1.8945567144719688,
      "grad_norm": 9.861899375915527,
      "learning_rate": 1.863265971316819e-05,
      "loss": 0.6549,
      "step": 93000
    },
    {
      "epoch": 1.9047425032594525,
      "grad_norm": 14.948457717895508,
      "learning_rate": 1.8571544980443286e-05,
      "loss": 0.6436,
      "step": 93500
    },
    {
      "epoch": 1.9149282920469362,
      "grad_norm": 15.613487243652344,
      "learning_rate": 1.8510430247718384e-05,
      "loss": 0.6355,
      "step": 94000
    },
    {
      "epoch": 1.9251140808344198,
      "grad_norm": 14.105792045593262,
      "learning_rate": 1.844931551499348e-05,
      "loss": 0.6345,
      "step": 94500
    },
    {
      "epoch": 1.9352998696219035,
      "grad_norm": 7.004807472229004,
      "learning_rate": 1.838820078226858e-05,
      "loss": 0.6456,
      "step": 95000
    },
    {
      "epoch": 1.9454856584093871,
      "grad_norm": 10.506879806518555,
      "learning_rate": 1.8327086049543677e-05,
      "loss": 0.6303,
      "step": 95500
    },
    {
      "epoch": 1.9556714471968708,
      "grad_norm": 11.595491409301758,
      "learning_rate": 1.8265971316818774e-05,
      "loss": 0.6509,
      "step": 96000
    },
    {
      "epoch": 1.9658572359843545,
      "grad_norm": 11.29542064666748,
      "learning_rate": 1.820485658409387e-05,
      "loss": 0.6273,
      "step": 96500
    },
    {
      "epoch": 1.9760430247718384,
      "grad_norm": 4.132607460021973,
      "learning_rate": 1.814374185136897e-05,
      "loss": 0.643,
      "step": 97000
    },
    {
      "epoch": 1.986228813559322,
      "grad_norm": 9.745738983154297,
      "learning_rate": 1.8082627118644067e-05,
      "loss": 0.6656,
      "step": 97500
    },
    {
      "epoch": 1.996414602346806,
      "grad_norm": 4.289452075958252,
      "learning_rate": 1.8021512385919164e-05,
      "loss": 0.6367,
      "step": 98000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.7218826413154602,
      "eval_loss": 0.6872764825820923,
      "eval_runtime": 7.6175,
      "eval_samples_per_second": 644.307,
      "eval_steps_per_second": 80.604,
      "step": 98176
    },
    {
      "epoch": 2.0066003911342896,
      "grad_norm": 19.260536193847656,
      "learning_rate": 1.7960397653194265e-05,
      "loss": 0.5568,
      "step": 98500
    },
    {
      "epoch": 2.0167861799217732,
      "grad_norm": 8.347018241882324,
      "learning_rate": 1.7899282920469363e-05,
      "loss": 0.5344,
      "step": 99000
    },
    {
      "epoch": 2.026971968709257,
      "grad_norm": 12.238682746887207,
      "learning_rate": 1.783816818774446e-05,
      "loss": 0.5685,
      "step": 99500
    },
    {
      "epoch": 2.0371577574967406,
      "grad_norm": 18.42786979675293,
      "learning_rate": 1.7777053455019558e-05,
      "loss": 0.5539,
      "step": 100000
    },
    {
      "epoch": 2.047343546284224,
      "grad_norm": 25.627717971801758,
      "learning_rate": 1.7715938722294655e-05,
      "loss": 0.5393,
      "step": 100500
    },
    {
      "epoch": 2.057529335071708,
      "grad_norm": 15.942806243896484,
      "learning_rate": 1.7654823989569753e-05,
      "loss": 0.5191,
      "step": 101000
    },
    {
      "epoch": 2.0677151238591915,
      "grad_norm": 7.591663360595703,
      "learning_rate": 1.759370925684485e-05,
      "loss": 0.5428,
      "step": 101500
    },
    {
      "epoch": 2.077900912646675,
      "grad_norm": 7.183356285095215,
      "learning_rate": 1.7532594524119948e-05,
      "loss": 0.5475,
      "step": 102000
    },
    {
      "epoch": 2.088086701434159,
      "grad_norm": 19.916288375854492,
      "learning_rate": 1.7471479791395046e-05,
      "loss": 0.5586,
      "step": 102500
    },
    {
      "epoch": 2.098272490221643,
      "grad_norm": 9.07480239868164,
      "learning_rate": 1.7410365058670143e-05,
      "loss": 0.5564,
      "step": 103000
    },
    {
      "epoch": 2.1084582790091266,
      "grad_norm": 7.647058010101318,
      "learning_rate": 1.734925032594524e-05,
      "loss": 0.543,
      "step": 103500
    },
    {
      "epoch": 2.1186440677966103,
      "grad_norm": 8.871644973754883,
      "learning_rate": 1.728813559322034e-05,
      "loss": 0.5623,
      "step": 104000
    },
    {
      "epoch": 2.128829856584094,
      "grad_norm": 9.516389846801758,
      "learning_rate": 1.7227020860495436e-05,
      "loss": 0.5464,
      "step": 104500
    },
    {
      "epoch": 2.1390156453715776,
      "grad_norm": 13.605746269226074,
      "learning_rate": 1.7165906127770537e-05,
      "loss": 0.5425,
      "step": 105000
    },
    {
      "epoch": 2.1492014341590613,
      "grad_norm": 14.016572952270508,
      "learning_rate": 1.710479139504563e-05,
      "loss": 0.5496,
      "step": 105500
    },
    {
      "epoch": 2.159387222946545,
      "grad_norm": 14.719120979309082,
      "learning_rate": 1.7043676662320732e-05,
      "loss": 0.5591,
      "step": 106000
    },
    {
      "epoch": 2.1695730117340286,
      "grad_norm": 14.922130584716797,
      "learning_rate": 1.6982561929595826e-05,
      "loss": 0.5462,
      "step": 106500
    },
    {
      "epoch": 2.1797588005215123,
      "grad_norm": 10.74787425994873,
      "learning_rate": 1.6921447196870927e-05,
      "loss": 0.5699,
      "step": 107000
    },
    {
      "epoch": 2.189944589308996,
      "grad_norm": 7.978420257568359,
      "learning_rate": 1.6860332464146024e-05,
      "loss": 0.5449,
      "step": 107500
    },
    {
      "epoch": 2.2001303780964796,
      "grad_norm": 15.361347198486328,
      "learning_rate": 1.6799217731421122e-05,
      "loss": 0.5443,
      "step": 108000
    },
    {
      "epoch": 2.2103161668839633,
      "grad_norm": 6.552661418914795,
      "learning_rate": 1.673810299869622e-05,
      "loss": 0.5663,
      "step": 108500
    },
    {
      "epoch": 2.2205019556714474,
      "grad_norm": 6.750521659851074,
      "learning_rate": 1.6676988265971317e-05,
      "loss": 0.5488,
      "step": 109000
    },
    {
      "epoch": 2.230687744458931,
      "grad_norm": 11.149799346923828,
      "learning_rate": 1.6615873533246415e-05,
      "loss": 0.5412,
      "step": 109500
    },
    {
      "epoch": 2.2408735332464147,
      "grad_norm": 19.362638473510742,
      "learning_rate": 1.6554758800521512e-05,
      "loss": 0.5505,
      "step": 110000
    },
    {
      "epoch": 2.2510593220338984,
      "grad_norm": 11.263615608215332,
      "learning_rate": 1.6493644067796613e-05,
      "loss": 0.5591,
      "step": 110500
    },
    {
      "epoch": 2.261245110821382,
      "grad_norm": 8.90456771850586,
      "learning_rate": 1.6432529335071707e-05,
      "loss": 0.5491,
      "step": 111000
    },
    {
      "epoch": 2.2714308996088657,
      "grad_norm": 11.962569236755371,
      "learning_rate": 1.6371414602346808e-05,
      "loss": 0.556,
      "step": 111500
    },
    {
      "epoch": 2.2816166883963493,
      "grad_norm": 8.397544860839844,
      "learning_rate": 1.6310299869621902e-05,
      "loss": 0.5561,
      "step": 112000
    },
    {
      "epoch": 2.291802477183833,
      "grad_norm": 16.376155853271484,
      "learning_rate": 1.6249185136897003e-05,
      "loss": 0.5435,
      "step": 112500
    },
    {
      "epoch": 2.3019882659713167,
      "grad_norm": 9.14609432220459,
      "learning_rate": 1.6188070404172097e-05,
      "loss": 0.5457,
      "step": 113000
    },
    {
      "epoch": 2.3121740547588003,
      "grad_norm": 26.53936767578125,
      "learning_rate": 1.61269556714472e-05,
      "loss": 0.5295,
      "step": 113500
    },
    {
      "epoch": 2.322359843546284,
      "grad_norm": 17.222721099853516,
      "learning_rate": 1.6065840938722293e-05,
      "loss": 0.5177,
      "step": 114000
    },
    {
      "epoch": 2.332545632333768,
      "grad_norm": 9.585100173950195,
      "learning_rate": 1.6004726205997394e-05,
      "loss": 0.5419,
      "step": 114500
    },
    {
      "epoch": 2.3427314211212518,
      "grad_norm": 9.392489433288574,
      "learning_rate": 1.5943611473272488e-05,
      "loss": 0.5296,
      "step": 115000
    },
    {
      "epoch": 2.3529172099087354,
      "grad_norm": 15.146162986755371,
      "learning_rate": 1.588249674054759e-05,
      "loss": 0.5518,
      "step": 115500
    },
    {
      "epoch": 2.363102998696219,
      "grad_norm": 15.631641387939453,
      "learning_rate": 1.582138200782269e-05,
      "loss": 0.5584,
      "step": 116000
    },
    {
      "epoch": 2.3732887874837028,
      "grad_norm": 17.875492095947266,
      "learning_rate": 1.5760267275097784e-05,
      "loss": 0.5601,
      "step": 116500
    },
    {
      "epoch": 2.3834745762711864,
      "grad_norm": 7.7471184730529785,
      "learning_rate": 1.5699152542372885e-05,
      "loss": 0.549,
      "step": 117000
    },
    {
      "epoch": 2.39366036505867,
      "grad_norm": 14.712841987609863,
      "learning_rate": 1.563803780964798e-05,
      "loss": 0.5293,
      "step": 117500
    },
    {
      "epoch": 2.4038461538461537,
      "grad_norm": 9.229011535644531,
      "learning_rate": 1.557692307692308e-05,
      "loss": 0.5483,
      "step": 118000
    },
    {
      "epoch": 2.4140319426336374,
      "grad_norm": 11.47548770904541,
      "learning_rate": 1.5515808344198174e-05,
      "loss": 0.5529,
      "step": 118500
    },
    {
      "epoch": 2.424217731421121,
      "grad_norm": 29.980873107910156,
      "learning_rate": 1.5454693611473275e-05,
      "loss": 0.5489,
      "step": 119000
    },
    {
      "epoch": 2.4344035202086047,
      "grad_norm": 10.478185653686523,
      "learning_rate": 1.539357887874837e-05,
      "loss": 0.5595,
      "step": 119500
    },
    {
      "epoch": 2.444589308996089,
      "grad_norm": 12.978096008300781,
      "learning_rate": 1.533246414602347e-05,
      "loss": 0.5612,
      "step": 120000
    },
    {
      "epoch": 2.4547750977835725,
      "grad_norm": 17.834806442260742,
      "learning_rate": 1.5271349413298564e-05,
      "loss": 0.5412,
      "step": 120500
    },
    {
      "epoch": 2.464960886571056,
      "grad_norm": 25.001754760742188,
      "learning_rate": 1.5210234680573665e-05,
      "loss": 0.5673,
      "step": 121000
    },
    {
      "epoch": 2.47514667535854,
      "grad_norm": 27.953767776489258,
      "learning_rate": 1.5149119947848761e-05,
      "loss": 0.5497,
      "step": 121500
    },
    {
      "epoch": 2.4853324641460235,
      "grad_norm": 9.370855331420898,
      "learning_rate": 1.508800521512386e-05,
      "loss": 0.5344,
      "step": 122000
    },
    {
      "epoch": 2.495518252933507,
      "grad_norm": 18.109821319580078,
      "learning_rate": 1.5026890482398956e-05,
      "loss": 0.5572,
      "step": 122500
    },
    {
      "epoch": 2.505704041720991,
      "grad_norm": 20.435270309448242,
      "learning_rate": 1.4965775749674055e-05,
      "loss": 0.5631,
      "step": 123000
    },
    {
      "epoch": 2.5158898305084745,
      "grad_norm": 10.656272888183594,
      "learning_rate": 1.4904661016949153e-05,
      "loss": 0.5344,
      "step": 123500
    },
    {
      "epoch": 2.526075619295958,
      "grad_norm": 18.796552658081055,
      "learning_rate": 1.484354628422425e-05,
      "loss": 0.5464,
      "step": 124000
    },
    {
      "epoch": 2.5362614080834422,
      "grad_norm": 9.630306243896484,
      "learning_rate": 1.4782431551499348e-05,
      "loss": 0.547,
      "step": 124500
    },
    {
      "epoch": 2.5464471968709255,
      "grad_norm": 6.3599467277526855,
      "learning_rate": 1.4721316818774445e-05,
      "loss": 0.5577,
      "step": 125000
    },
    {
      "epoch": 2.5566329856584096,
      "grad_norm": 8.967144966125488,
      "learning_rate": 1.4660202086049545e-05,
      "loss": 0.5524,
      "step": 125500
    },
    {
      "epoch": 2.5668187744458932,
      "grad_norm": 16.690263748168945,
      "learning_rate": 1.4599087353324642e-05,
      "loss": 0.5546,
      "step": 126000
    },
    {
      "epoch": 2.577004563233377,
      "grad_norm": 17.713790893554688,
      "learning_rate": 1.453797262059974e-05,
      "loss": 0.5686,
      "step": 126500
    },
    {
      "epoch": 2.5871903520208606,
      "grad_norm": 7.970760822296143,
      "learning_rate": 1.4476857887874837e-05,
      "loss": 0.5273,
      "step": 127000
    },
    {
      "epoch": 2.5973761408083442,
      "grad_norm": 8.869507789611816,
      "learning_rate": 1.4415743155149935e-05,
      "loss": 0.5472,
      "step": 127500
    },
    {
      "epoch": 2.607561929595828,
      "grad_norm": 17.247589111328125,
      "learning_rate": 1.4354628422425032e-05,
      "loss": 0.5444,
      "step": 128000
    },
    {
      "epoch": 2.6177477183833116,
      "grad_norm": 16.415138244628906,
      "learning_rate": 1.429351368970013e-05,
      "loss": 0.5445,
      "step": 128500
    },
    {
      "epoch": 2.627933507170795,
      "grad_norm": 10.408729553222656,
      "learning_rate": 1.4232398956975227e-05,
      "loss": 0.5667,
      "step": 129000
    },
    {
      "epoch": 2.638119295958279,
      "grad_norm": 53.77888488769531,
      "learning_rate": 1.4171284224250327e-05,
      "loss": 0.547,
      "step": 129500
    },
    {
      "epoch": 2.648305084745763,
      "grad_norm": 12.244620323181152,
      "learning_rate": 1.4110169491525424e-05,
      "loss": 0.5428,
      "step": 130000
    },
    {
      "epoch": 2.658490873533246,
      "grad_norm": 11.80124568939209,
      "learning_rate": 1.4049054758800522e-05,
      "loss": 0.5565,
      "step": 130500
    },
    {
      "epoch": 2.6686766623207303,
      "grad_norm": 13.250449180603027,
      "learning_rate": 1.398794002607562e-05,
      "loss": 0.5423,
      "step": 131000
    },
    {
      "epoch": 2.678862451108214,
      "grad_norm": 15.81285572052002,
      "learning_rate": 1.3926825293350719e-05,
      "loss": 0.5621,
      "step": 131500
    },
    {
      "epoch": 2.6890482398956976,
      "grad_norm": 22.584035873413086,
      "learning_rate": 1.3865710560625816e-05,
      "loss": 0.5521,
      "step": 132000
    },
    {
      "epoch": 2.6992340286831813,
      "grad_norm": 23.865680694580078,
      "learning_rate": 1.3804595827900914e-05,
      "loss": 0.551,
      "step": 132500
    },
    {
      "epoch": 2.709419817470665,
      "grad_norm": 18.28876304626465,
      "learning_rate": 1.3743481095176011e-05,
      "loss": 0.5451,
      "step": 133000
    },
    {
      "epoch": 2.7196056062581486,
      "grad_norm": 16.23969078063965,
      "learning_rate": 1.3682366362451109e-05,
      "loss": 0.5392,
      "step": 133500
    },
    {
      "epoch": 2.7297913950456323,
      "grad_norm": 14.673959732055664,
      "learning_rate": 1.3621251629726206e-05,
      "loss": 0.5451,
      "step": 134000
    },
    {
      "epoch": 2.739977183833116,
      "grad_norm": 8.019514083862305,
      "learning_rate": 1.3560136897001304e-05,
      "loss": 0.5568,
      "step": 134500
    },
    {
      "epoch": 2.7501629726205996,
      "grad_norm": 13.495898246765137,
      "learning_rate": 1.3499022164276401e-05,
      "loss": 0.5734,
      "step": 135000
    },
    {
      "epoch": 2.7603487614080837,
      "grad_norm": 7.548976421356201,
      "learning_rate": 1.3437907431551499e-05,
      "loss": 0.557,
      "step": 135500
    },
    {
      "epoch": 2.770534550195567,
      "grad_norm": 20.049760818481445,
      "learning_rate": 1.3376792698826597e-05,
      "loss": 0.5658,
      "step": 136000
    },
    {
      "epoch": 2.780720338983051,
      "grad_norm": 9.346122741699219,
      "learning_rate": 1.3315677966101694e-05,
      "loss": 0.535,
      "step": 136500
    },
    {
      "epoch": 2.7909061277705347,
      "grad_norm": 15.080660820007324,
      "learning_rate": 1.3254563233376792e-05,
      "loss": 0.5538,
      "step": 137000
    },
    {
      "epoch": 2.8010919165580184,
      "grad_norm": 11.485374450683594,
      "learning_rate": 1.3193448500651891e-05,
      "loss": 0.5419,
      "step": 137500
    },
    {
      "epoch": 2.811277705345502,
      "grad_norm": 12.089446067810059,
      "learning_rate": 1.3132333767926988e-05,
      "loss": 0.5651,
      "step": 138000
    },
    {
      "epoch": 2.8214634941329857,
      "grad_norm": 5.466490268707275,
      "learning_rate": 1.3071219035202088e-05,
      "loss": 0.5217,
      "step": 138500
    },
    {
      "epoch": 2.8316492829204694,
      "grad_norm": 12.89148235321045,
      "learning_rate": 1.3010104302477185e-05,
      "loss": 0.5477,
      "step": 139000
    },
    {
      "epoch": 2.841835071707953,
      "grad_norm": 5.610709190368652,
      "learning_rate": 1.2948989569752283e-05,
      "loss": 0.5377,
      "step": 139500
    },
    {
      "epoch": 2.8520208604954367,
      "grad_norm": 26.2186222076416,
      "learning_rate": 1.288787483702738e-05,
      "loss": 0.5281,
      "step": 140000
    },
    {
      "epoch": 2.8622066492829203,
      "grad_norm": 2.1066462993621826,
      "learning_rate": 1.2826760104302478e-05,
      "loss": 0.5387,
      "step": 140500
    },
    {
      "epoch": 2.872392438070404,
      "grad_norm": 13.6646728515625,
      "learning_rate": 1.2765645371577575e-05,
      "loss": 0.5416,
      "step": 141000
    },
    {
      "epoch": 2.8825782268578877,
      "grad_norm": 14.357284545898438,
      "learning_rate": 1.2704530638852673e-05,
      "loss": 0.5504,
      "step": 141500
    },
    {
      "epoch": 2.8927640156453718,
      "grad_norm": 14.08674144744873,
      "learning_rate": 1.264341590612777e-05,
      "loss": 0.5691,
      "step": 142000
    },
    {
      "epoch": 2.9029498044328554,
      "grad_norm": 20.460561752319336,
      "learning_rate": 1.2582301173402868e-05,
      "loss": 0.5624,
      "step": 142500
    },
    {
      "epoch": 2.913135593220339,
      "grad_norm": 12.731225967407227,
      "learning_rate": 1.2521186440677966e-05,
      "loss": 0.5439,
      "step": 143000
    },
    {
      "epoch": 2.9233213820078228,
      "grad_norm": 3.781522274017334,
      "learning_rate": 1.2460071707953065e-05,
      "loss": 0.5348,
      "step": 143500
    },
    {
      "epoch": 2.9335071707953064,
      "grad_norm": 14.624204635620117,
      "learning_rate": 1.2398956975228162e-05,
      "loss": 0.5589,
      "step": 144000
    },
    {
      "epoch": 2.94369295958279,
      "grad_norm": 18.551095962524414,
      "learning_rate": 1.233784224250326e-05,
      "loss": 0.532,
      "step": 144500
    },
    {
      "epoch": 2.9538787483702738,
      "grad_norm": 17.094831466674805,
      "learning_rate": 1.2276727509778357e-05,
      "loss": 0.558,
      "step": 145000
    },
    {
      "epoch": 2.9640645371577574,
      "grad_norm": 12.306907653808594,
      "learning_rate": 1.2215612777053455e-05,
      "loss": 0.542,
      "step": 145500
    },
    {
      "epoch": 2.974250325945241,
      "grad_norm": 12.134025573730469,
      "learning_rate": 1.2154498044328553e-05,
      "loss": 0.5429,
      "step": 146000
    },
    {
      "epoch": 2.9844361147327247,
      "grad_norm": 22.295795440673828,
      "learning_rate": 1.209338331160365e-05,
      "loss": 0.5632,
      "step": 146500
    },
    {
      "epoch": 2.9946219035202084,
      "grad_norm": 17.749858856201172,
      "learning_rate": 1.2032268578878748e-05,
      "loss": 0.5402,
      "step": 147000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.7337000966072083,
      "eval_loss": 0.7322831153869629,
      "eval_runtime": 8.3496,
      "eval_samples_per_second": 587.813,
      "eval_steps_per_second": 73.536,
      "step": 147264
    }
  ],
"logging_steps": 500, |
|
"max_steps": 245440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.001 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.950698126852608e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|