|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.997411003236246, |
|
"eval_steps": 50, |
|
"global_step": 4632, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008629989212513484, |
|
"grad_norm": 20.681591033935547, |
|
"learning_rate": 2.1551724137931036e-07, |
|
"loss": 1.0408, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.017259978425026967, |
|
"grad_norm": 96.75000762939453, |
|
"learning_rate": 4.3103448275862073e-07, |
|
"loss": 1.047, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.025889967637540454, |
|
"grad_norm": 22.108104705810547, |
|
"learning_rate": 6.465517241379311e-07, |
|
"loss": 1.0718, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.034519956850053934, |
|
"grad_norm": 40.05157470703125, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 1.0488, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.043149946062567425, |
|
"grad_norm": 15.964655876159668, |
|
"learning_rate": 1.0775862068965518e-06, |
|
"loss": 1.075, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.043149946062567425, |
|
"eval_accuracy": 0.49320388349514566, |
|
"eval_loss": 1.018173336982727, |
|
"eval_runtime": 322.676, |
|
"eval_samples_per_second": 1.596, |
|
"eval_steps_per_second": 1.596, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05177993527508091, |
|
"grad_norm": 27.802989959716797, |
|
"learning_rate": 1.2931034482758623e-06, |
|
"loss": 1.1389, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06040992448759439, |
|
"grad_norm": 28.11711883544922, |
|
"learning_rate": 1.5086206896551726e-06, |
|
"loss": 1.1116, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06903991370010787, |
|
"grad_norm": 22.176109313964844, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 1.0697, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07766990291262135, |
|
"grad_norm": 41.33392333984375, |
|
"learning_rate": 1.9396551724137932e-06, |
|
"loss": 1.0242, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08629989212513485, |
|
"grad_norm": 34.400508880615234, |
|
"learning_rate": 2.1551724137931035e-06, |
|
"loss": 1.0505, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08629989212513485, |
|
"eval_accuracy": 0.5009708737864078, |
|
"eval_loss": 0.9943639039993286, |
|
"eval_runtime": 321.8255, |
|
"eval_samples_per_second": 1.6, |
|
"eval_steps_per_second": 1.6, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09492988133764833, |
|
"grad_norm": 28.23130989074707, |
|
"learning_rate": 2.370689655172414e-06, |
|
"loss": 1.0073, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10355987055016182, |
|
"grad_norm": 36.090736389160156, |
|
"learning_rate": 2.5862068965517246e-06, |
|
"loss": 0.9802, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1121898597626753, |
|
"grad_norm": 58.96036148071289, |
|
"learning_rate": 2.8017241379310345e-06, |
|
"loss": 0.9827, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12081984897518878, |
|
"grad_norm": 18.94993782043457, |
|
"learning_rate": 3.017241379310345e-06, |
|
"loss": 1.0015, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"grad_norm": 32.874114990234375, |
|
"learning_rate": 3.2327586206896555e-06, |
|
"loss": 0.9387, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"eval_accuracy": 0.5048543689320388, |
|
"eval_loss": 0.9101472496986389, |
|
"eval_runtime": 321.9422, |
|
"eval_samples_per_second": 1.6, |
|
"eval_steps_per_second": 1.6, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13807982740021574, |
|
"grad_norm": 14.486083030700684, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 0.9255, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14670981661272922, |
|
"grad_norm": 26.06964111328125, |
|
"learning_rate": 3.663793103448276e-06, |
|
"loss": 0.8775, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1553398058252427, |
|
"grad_norm": 23.44382667541504, |
|
"learning_rate": 3.8793103448275865e-06, |
|
"loss": 0.8675, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.16396979503775622, |
|
"grad_norm": 22.29359245300293, |
|
"learning_rate": 4.094827586206897e-06, |
|
"loss": 0.9728, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1725997842502697, |
|
"grad_norm": 38.14244842529297, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 0.92, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1725997842502697, |
|
"eval_accuracy": 0.5048543689320388, |
|
"eval_loss": 0.9019931554794312, |
|
"eval_runtime": 321.9115, |
|
"eval_samples_per_second": 1.6, |
|
"eval_steps_per_second": 1.6, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18122977346278318, |
|
"grad_norm": 64.9331283569336, |
|
"learning_rate": 4.525862068965518e-06, |
|
"loss": 0.9633, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18985976267529667, |
|
"grad_norm": 39.31247329711914, |
|
"learning_rate": 4.741379310344828e-06, |
|
"loss": 0.9646, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19848975188781015, |
|
"grad_norm": 26.192481994628906, |
|
"learning_rate": 4.9568965517241384e-06, |
|
"loss": 0.9956, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.20711974110032363, |
|
"grad_norm": 33.946685791015625, |
|
"learning_rate": 5.172413793103449e-06, |
|
"loss": 0.8929, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.21574973031283712, |
|
"grad_norm": 20.04779624938965, |
|
"learning_rate": 5.38793103448276e-06, |
|
"loss": 0.9531, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21574973031283712, |
|
"eval_accuracy": 0.5223300970873787, |
|
"eval_loss": 0.886761486530304, |
|
"eval_runtime": 321.7179, |
|
"eval_samples_per_second": 1.601, |
|
"eval_steps_per_second": 1.601, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2243797195253506, |
|
"grad_norm": 53.125587463378906, |
|
"learning_rate": 5.603448275862069e-06, |
|
"loss": 0.9716, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.23300970873786409, |
|
"grad_norm": 43.821533203125, |
|
"learning_rate": 5.81896551724138e-06, |
|
"loss": 0.9407, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24163969795037757, |
|
"grad_norm": 47.41954803466797, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 0.9464, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.25026968716289105, |
|
"grad_norm": 29.925968170166016, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.9151, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"grad_norm": 23.372934341430664, |
|
"learning_rate": 6.465517241379311e-06, |
|
"loss": 0.849, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"eval_accuracy": 0.5339805825242718, |
|
"eval_loss": 0.856666088104248, |
|
"eval_runtime": 321.7027, |
|
"eval_samples_per_second": 1.601, |
|
"eval_steps_per_second": 1.601, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.267529665587918, |
|
"grad_norm": 22.651479721069336, |
|
"learning_rate": 6.681034482758622e-06, |
|
"loss": 1.0237, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2761596548004315, |
|
"grad_norm": 17.50941276550293, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.8401, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.284789644012945, |
|
"grad_norm": 51.20744323730469, |
|
"learning_rate": 7.1120689655172415e-06, |
|
"loss": 0.9366, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29341963322545844, |
|
"grad_norm": 23.283870697021484, |
|
"learning_rate": 7.327586206896552e-06, |
|
"loss": 0.8198, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.30204962243797195, |
|
"grad_norm": 24.28423500061035, |
|
"learning_rate": 7.543103448275862e-06, |
|
"loss": 0.8897, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.30204962243797195, |
|
"eval_accuracy": 0.5262135922330097, |
|
"eval_loss": 0.8523032069206238, |
|
"eval_runtime": 321.7555, |
|
"eval_samples_per_second": 1.601, |
|
"eval_steps_per_second": 1.601, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3106796116504854, |
|
"grad_norm": 27.711999893188477, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 0.8352, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3193096008629989, |
|
"grad_norm": 25.017581939697266, |
|
"learning_rate": 7.974137931034484e-06, |
|
"loss": 0.7918, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32793959007551243, |
|
"grad_norm": 33.27495193481445, |
|
"learning_rate": 8.189655172413794e-06, |
|
"loss": 0.9004, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3365695792880259, |
|
"grad_norm": 17.355253219604492, |
|
"learning_rate": 8.405172413793105e-06, |
|
"loss": 0.8079, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3451995685005394, |
|
"grad_norm": 33.237518310546875, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.8512, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3451995685005394, |
|
"eval_accuracy": 0.5262135922330097, |
|
"eval_loss": 0.8104857206344604, |
|
"eval_runtime": 321.6492, |
|
"eval_samples_per_second": 1.601, |
|
"eval_steps_per_second": 1.601, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.35382955771305286, |
|
"grad_norm": 31.926298141479492, |
|
"learning_rate": 8.836206896551725e-06, |
|
"loss": 0.8049, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.36245954692556637, |
|
"grad_norm": 18.511268615722656, |
|
"learning_rate": 9.051724137931036e-06, |
|
"loss": 0.7887, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3710895361380798, |
|
"grad_norm": 12.080615043640137, |
|
"learning_rate": 9.267241379310346e-06, |
|
"loss": 0.8286, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.37971952535059333, |
|
"grad_norm": 22.48563003540039, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 0.8201, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"grad_norm": 25.83173179626465, |
|
"learning_rate": 9.698275862068966e-06, |
|
"loss": 0.7854, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"eval_accuracy": 0.5106796116504855, |
|
"eval_loss": 0.7994323372840881, |
|
"eval_runtime": 321.4421, |
|
"eval_samples_per_second": 1.602, |
|
"eval_steps_per_second": 1.602, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3969795037756203, |
|
"grad_norm": 41.783851623535156, |
|
"learning_rate": 9.913793103448277e-06, |
|
"loss": 0.8339, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.40560949298813376, |
|
"grad_norm": 12.72182846069336, |
|
"learning_rate": 9.9999488687872e-06, |
|
"loss": 0.8063, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.41423948220064727, |
|
"grad_norm": 28.933361053466797, |
|
"learning_rate": 9.999636404051638e-06, |
|
"loss": 0.8554, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4228694714131607, |
|
"grad_norm": 48.14093017578125, |
|
"learning_rate": 9.999039898540166e-06, |
|
"loss": 0.9297, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.43149946062567424, |
|
"grad_norm": 27.8731746673584, |
|
"learning_rate": 9.998159386141626e-06, |
|
"loss": 0.8147, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43149946062567424, |
|
"eval_accuracy": 0.5398058252427185, |
|
"eval_loss": 0.7859384417533875, |
|
"eval_runtime": 321.5871, |
|
"eval_samples_per_second": 1.601, |
|
"eval_steps_per_second": 1.601, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4401294498381877, |
|
"grad_norm": 17.547481536865234, |
|
"learning_rate": 9.996994916879941e-06, |
|
"loss": 0.8449, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4487594390507012, |
|
"grad_norm": 33.447723388671875, |
|
"learning_rate": 9.995546556911271e-06, |
|
"loss": 0.779, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.45738942826321466, |
|
"grad_norm": 41.81571578979492, |
|
"learning_rate": 9.99381438852026e-06, |
|
"loss": 0.7262, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.46601941747572817, |
|
"grad_norm": 40.82163619995117, |
|
"learning_rate": 9.991798510115351e-06, |
|
"loss": 0.8282, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.4746494066882416, |
|
"grad_norm": 55.30727767944336, |
|
"learning_rate": 9.989499036223209e-06, |
|
"loss": 0.8075, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4746494066882416, |
|
"eval_accuracy": 0.5553398058252427, |
|
"eval_loss": 0.7565743923187256, |
|
"eval_runtime": 321.511, |
|
"eval_samples_per_second": 1.602, |
|
"eval_steps_per_second": 1.602, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.48327939590075514, |
|
"grad_norm": 51.085289001464844, |
|
"learning_rate": 9.986916097482204e-06, |
|
"loss": 0.7747, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4919093851132686, |
|
"grad_norm": 65.66133880615234, |
|
"learning_rate": 9.98404984063499e-06, |
|
"loss": 0.7563, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5005393743257821, |
|
"grad_norm": 11.704032897949219, |
|
"learning_rate": 9.980900428520171e-06, |
|
"loss": 0.7819, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5091693635382956, |
|
"grad_norm": 27.524673461914062, |
|
"learning_rate": 9.977468040063054e-06, |
|
"loss": 0.7777, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"grad_norm": 22.56294822692871, |
|
"learning_rate": 9.973752870265473e-06, |
|
"loss": 0.8282, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"eval_accuracy": 0.5145631067961165, |
|
"eval_loss": 0.7454360127449036, |
|
"eval_runtime": 321.3773, |
|
"eval_samples_per_second": 1.602, |
|
"eval_steps_per_second": 1.602, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5264293419633226, |
|
"grad_norm": 24.327606201171875, |
|
"learning_rate": 9.96975513019472e-06, |
|
"loss": 0.7907, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.535059331175836, |
|
"grad_norm": 18.27765655517578, |
|
"learning_rate": 9.965475046971548e-06, |
|
"loss": 0.8475, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5436893203883495, |
|
"grad_norm": 23.742115020751953, |
|
"learning_rate": 9.960912863757273e-06, |
|
"loss": 0.7363, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.552319309600863, |
|
"grad_norm": 11.194246292114258, |
|
"learning_rate": 9.956068839739955e-06, |
|
"loss": 0.8291, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5609492988133765, |
|
"grad_norm": 23.568937301635742, |
|
"learning_rate": 9.950943250119674e-06, |
|
"loss": 0.7524, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5609492988133765, |
|
"eval_accuracy": 0.49902912621359224, |
|
"eval_loss": 0.7317044138908386, |
|
"eval_runtime": 321.3686, |
|
"eval_samples_per_second": 1.603, |
|
"eval_steps_per_second": 1.603, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.56957928802589, |
|
"grad_norm": 11.3060302734375, |
|
"learning_rate": 9.945536386092893e-06, |
|
"loss": 0.7319, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5782092772384034, |
|
"grad_norm": 29.552515029907227, |
|
"learning_rate": 9.939848554835927e-06, |
|
"loss": 0.6644, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5868392664509169, |
|
"grad_norm": 23.357723236083984, |
|
"learning_rate": 9.93388007948747e-06, |
|
"loss": 0.8749, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5954692556634305, |
|
"grad_norm": 18.92988395690918, |
|
"learning_rate": 9.927631299130254e-06, |
|
"loss": 0.8157, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6040992448759439, |
|
"grad_norm": 18.492721557617188, |
|
"learning_rate": 9.921102568771781e-06, |
|
"loss": 0.7338, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6040992448759439, |
|
"eval_accuracy": 0.5339805825242718, |
|
"eval_loss": 0.7266865968704224, |
|
"eval_runtime": 321.4222, |
|
"eval_samples_per_second": 1.602, |
|
"eval_steps_per_second": 1.602, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6127292340884574, |
|
"grad_norm": 24.050262451171875, |
|
"learning_rate": 9.914294259324149e-06, |
|
"loss": 0.7609, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6213592233009708, |
|
"grad_norm": 8.642351150512695, |
|
"learning_rate": 9.907206757582987e-06, |
|
"loss": 0.7681, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6299892125134844, |
|
"grad_norm": 20.86747932434082, |
|
"learning_rate": 9.899840466205473e-06, |
|
"loss": 0.8052, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6386192017259978, |
|
"grad_norm": 44.50579833984375, |
|
"learning_rate": 9.892195803687464e-06, |
|
"loss": 0.739, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"grad_norm": 20.538475036621094, |
|
"learning_rate": 9.884273204339716e-06, |
|
"loss": 0.7909, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"eval_accuracy": 0.5611650485436893, |
|
"eval_loss": 0.7110950350761414, |
|
"eval_runtime": 321.0742, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6558791801510249, |
|
"grad_norm": 53.17654037475586, |
|
"learning_rate": 9.876073118263216e-06, |
|
"loss": 0.8172, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6645091693635383, |
|
"grad_norm": 26.998899459838867, |
|
"learning_rate": 9.867596011323602e-06, |
|
"loss": 0.7901, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6731391585760518, |
|
"grad_norm": 45.38533020019531, |
|
"learning_rate": 9.858842365124702e-06, |
|
"loss": 0.7284, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6817691477885652, |
|
"grad_norm": 28.952617645263672, |
|
"learning_rate": 9.849812676981172e-06, |
|
"loss": 0.7501, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6903991370010788, |
|
"grad_norm": 19.87049102783203, |
|
"learning_rate": 9.840507459890244e-06, |
|
"loss": 0.7783, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6903991370010788, |
|
"eval_accuracy": 0.5300970873786408, |
|
"eval_loss": 0.7211207151412964, |
|
"eval_runtime": 320.8034, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6990291262135923, |
|
"grad_norm": 15.508710861206055, |
|
"learning_rate": 9.830927242502575e-06, |
|
"loss": 0.6965, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7076591154261057, |
|
"grad_norm": 36.019798278808594, |
|
"learning_rate": 9.821072569092223e-06, |
|
"loss": 0.77, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7162891046386192, |
|
"grad_norm": 13.119162559509277, |
|
"learning_rate": 9.810943999525714e-06, |
|
"loss": 0.7158, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7249190938511327, |
|
"grad_norm": 20.22465705871582, |
|
"learning_rate": 9.800542109230247e-06, |
|
"loss": 0.6938, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7335490830636462, |
|
"grad_norm": 33.313209533691406, |
|
"learning_rate": 9.78986748916099e-06, |
|
"loss": 0.7895, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7335490830636462, |
|
"eval_accuracy": 0.5592233009708738, |
|
"eval_loss": 0.7069711685180664, |
|
"eval_runtime": 321.285, |
|
"eval_samples_per_second": 1.603, |
|
"eval_steps_per_second": 1.603, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7421790722761596, |
|
"grad_norm": 9.106620788574219, |
|
"learning_rate": 9.778920745767524e-06, |
|
"loss": 0.6717, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7508090614886731, |
|
"grad_norm": 34.899375915527344, |
|
"learning_rate": 9.767702500959365e-06, |
|
"loss": 0.7353, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7594390507011867, |
|
"grad_norm": 29.355737686157227, |
|
"learning_rate": 9.756213392070654e-06, |
|
"loss": 0.7315, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7680690399137001, |
|
"grad_norm": 16.923168182373047, |
|
"learning_rate": 9.744454071823936e-06, |
|
"loss": 0.6777, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7766990291262136, |
|
"grad_norm": 7.441469192504883, |
|
"learning_rate": 9.732425208293083e-06, |
|
"loss": 0.6881, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7766990291262136, |
|
"eval_accuracy": 0.537864077669903, |
|
"eval_loss": 0.7709933519363403, |
|
"eval_runtime": 321.2302, |
|
"eval_samples_per_second": 1.603, |
|
"eval_steps_per_second": 1.603, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.785329018338727, |
|
"grad_norm": 17.159208297729492, |
|
"learning_rate": 9.720127484865336e-06, |
|
"loss": 0.7973, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7939590075512406, |
|
"grad_norm": 29.373632431030273, |
|
"learning_rate": 9.707561600202481e-06, |
|
"loss": 0.6946, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8025889967637541, |
|
"grad_norm": 40.986690521240234, |
|
"learning_rate": 9.694728268201162e-06, |
|
"loss": 0.7697, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8112189859762675, |
|
"grad_norm": 10.117018699645996, |
|
"learning_rate": 9.681628217952308e-06, |
|
"loss": 0.7183, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.819848975188781, |
|
"grad_norm": 45.013118743896484, |
|
"learning_rate": 9.668262193699731e-06, |
|
"loss": 0.7137, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.819848975188781, |
|
"eval_accuracy": 0.5805825242718446, |
|
"eval_loss": 0.6908486485481262, |
|
"eval_runtime": 321.1671, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8284789644012945, |
|
"grad_norm": 22.911548614501953, |
|
"learning_rate": 9.65463095479783e-06, |
|
"loss": 0.7166, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.837108953613808, |
|
"grad_norm": 9.517961502075195, |
|
"learning_rate": 9.640735275668453e-06, |
|
"loss": 0.7713, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8457389428263214, |
|
"grad_norm": 19.63594627380371, |
|
"learning_rate": 9.62657594575691e-06, |
|
"loss": 0.7101, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8543689320388349, |
|
"grad_norm": 27.475940704345703, |
|
"learning_rate": 9.6121537694871e-06, |
|
"loss": 0.741, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8629989212513485, |
|
"grad_norm": 13.922393798828125, |
|
"learning_rate": 9.597469566215841e-06, |
|
"loss": 0.6924, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8629989212513485, |
|
"eval_accuracy": 0.6, |
|
"eval_loss": 0.6857309341430664, |
|
"eval_runtime": 321.1313, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8716289104638619, |
|
"grad_norm": 8.671666145324707, |
|
"learning_rate": 9.582524170186294e-06, |
|
"loss": 0.6936, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8802588996763754, |
|
"grad_norm": 11.311553001403809, |
|
"learning_rate": 9.567318430480579e-06, |
|
"loss": 0.6853, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 6.2082648277282715, |
|
"learning_rate": 9.55185321097154e-06, |
|
"loss": 0.6846, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.8975188781014024, |
|
"grad_norm": 35.873565673828125, |
|
"learning_rate": 9.536129390273659e-06, |
|
"loss": 0.7125, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9061488673139159, |
|
"grad_norm": 3.9832065105438232, |
|
"learning_rate": 9.520147861693138e-06, |
|
"loss": 0.7275, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9061488673139159, |
|
"eval_accuracy": 0.5766990291262136, |
|
"eval_loss": 0.6835415959358215, |
|
"eval_runtime": 321.1452, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9147788565264293, |
|
"grad_norm": 9.8655424118042, |
|
"learning_rate": 9.503909533177162e-06, |
|
"loss": 0.7286, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9234088457389428, |
|
"grad_norm": 14.413016319274902, |
|
"learning_rate": 9.487415327262303e-06, |
|
"loss": 0.7012, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9320388349514563, |
|
"grad_norm": 22.791946411132812, |
|
"learning_rate": 9.470666181022114e-06, |
|
"loss": 0.7057, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9406688241639698, |
|
"grad_norm": 7.595472812652588, |
|
"learning_rate": 9.453663046013889e-06, |
|
"loss": 0.7165, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9492988133764833, |
|
"grad_norm": 6.206796169281006, |
|
"learning_rate": 9.436406888224603e-06, |
|
"loss": 0.67, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9492988133764833, |
|
"eval_accuracy": 0.570873786407767, |
|
"eval_loss": 0.6888366341590881, |
|
"eval_runtime": 321.1897, |
|
"eval_samples_per_second": 1.603, |
|
"eval_steps_per_second": 1.603, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9579288025889967, |
|
"grad_norm": 9.740569114685059, |
|
"learning_rate": 9.418898688016042e-06, |
|
"loss": 0.7177, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9665587918015103, |
|
"grad_norm": 9.868525505065918, |
|
"learning_rate": 9.40113944006909e-06, |
|
"loss": 0.6841, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9751887810140237, |
|
"grad_norm": 10.188973426818848, |
|
"learning_rate": 9.383130153327231e-06, |
|
"loss": 0.6808, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9838187702265372, |
|
"grad_norm": 5.215792655944824, |
|
"learning_rate": 9.36487185093922e-06, |
|
"loss": 0.7059, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9924487594390508, |
|
"grad_norm": 5.438614845275879, |
|
"learning_rate": 9.34636557020097e-06, |
|
"loss": 0.6787, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9924487594390508, |
|
"eval_accuracy": 0.596116504854369, |
|
"eval_loss": 0.6860348582267761, |
|
"eval_runtime": 320.9468, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.0010787486515642, |
|
"grad_norm": 7.045734405517578, |
|
"learning_rate": 9.327612362496601e-06, |
|
"loss": 0.6904, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.0097087378640777, |
|
"grad_norm": 21.833343505859375, |
|
"learning_rate": 9.308613293238722e-06, |
|
"loss": 0.7516, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.0183387270765911, |
|
"grad_norm": 4.44768762588501, |
|
"learning_rate": 9.2893694418079e-06, |
|
"loss": 0.7105, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.0269687162891046, |
|
"grad_norm": 12.016294479370117, |
|
"learning_rate": 9.269881901491335e-06, |
|
"loss": 0.67, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.035598705501618, |
|
"grad_norm": 5.096578598022461, |
|
"learning_rate": 9.250151779420756e-06, |
|
"loss": 0.7012, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.035598705501618, |
|
"eval_accuracy": 0.570873786407767, |
|
"eval_loss": 0.6847370266914368, |
|
"eval_runtime": 320.5183, |
|
"eval_samples_per_second": 1.607, |
|
"eval_steps_per_second": 1.607, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.0442286947141317, |
|
"grad_norm": 11.158854484558105, |
|
"learning_rate": 9.230180196509506e-06, |
|
"loss": 0.6726, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.0528586839266452, |
|
"grad_norm": 7.818958282470703, |
|
"learning_rate": 9.209968287388878e-06, |
|
"loss": 0.6737, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.0614886731391586, |
|
"grad_norm": 4.283718109130859, |
|
"learning_rate": 9.189517200343643e-06, |
|
"loss": 0.6421, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.070118662351672, |
|
"grad_norm": 6.186824321746826, |
|
"learning_rate": 9.168828097246819e-06, |
|
"loss": 0.7709, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.0787486515641855, |
|
"grad_norm": 5.761249542236328, |
|
"learning_rate": 9.147902153493659e-06, |
|
"loss": 0.6765, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.0787486515641855, |
|
"eval_accuracy": 0.5786407766990291, |
|
"eval_loss": 0.6961000561714172, |
|
"eval_runtime": 320.4513, |
|
"eval_samples_per_second": 1.607, |
|
"eval_steps_per_second": 1.607, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.087378640776699, |
|
"grad_norm": 5.015466213226318, |
|
"learning_rate": 9.126740557934874e-06, |
|
"loss": 0.6551, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.0960086299892124, |
|
"grad_norm": 8.18385124206543, |
|
"learning_rate": 9.105344512809097e-06, |
|
"loss": 0.6606, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.104638619201726, |
|
"grad_norm": 3.6305551528930664, |
|
"learning_rate": 9.083715233674572e-06, |
|
"loss": 0.7058, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.1132686084142396, |
|
"grad_norm": 9.872076034545898, |
|
"learning_rate": 9.061853949340104e-06, |
|
"loss": 0.6577, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.121898597626753, |
|
"grad_norm": 4.889667510986328, |
|
"learning_rate": 9.039761901795241e-06, |
|
"loss": 0.7052, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.121898597626753, |
|
"eval_accuracy": 0.6058252427184466, |
|
"eval_loss": 0.6881099939346313, |
|
"eval_runtime": 320.8035, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.1305285868392665, |
|
"grad_norm": 3.392106056213379, |
|
"learning_rate": 9.017440346139718e-06, |
|
"loss": 0.681, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.13915857605178, |
|
"grad_norm": 5.220512866973877, |
|
"learning_rate": 8.994890550512152e-06, |
|
"loss": 0.7117, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.1477885652642934, |
|
"grad_norm": 11.190145492553711, |
|
"learning_rate": 8.972113796017992e-06, |
|
"loss": 0.7058, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.1564185544768069, |
|
"grad_norm": 3.2504310607910156, |
|
"learning_rate": 8.949111376656741e-06, |
|
"loss": 0.6867, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.1650485436893203, |
|
"grad_norm": 3.312730073928833, |
|
"learning_rate": 8.925884599248437e-06, |
|
"loss": 0.6804, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.1650485436893203, |
|
"eval_accuracy": 0.6097087378640776, |
|
"eval_loss": 0.6778111457824707, |
|
"eval_runtime": 320.8442, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.173678532901834, |
|
"grad_norm": 3.8169898986816406, |
|
"learning_rate": 8.902434783359417e-06, |
|
"loss": 0.6812, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.1823085221143474, |
|
"grad_norm": 13.139059066772461, |
|
"learning_rate": 8.878763261227337e-06, |
|
"loss": 0.7111, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.190938511326861, |
|
"grad_norm": 8.938994407653809, |
|
"learning_rate": 8.854871377685496e-06, |
|
"loss": 0.6762, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.1995685005393744, |
|
"grad_norm": 7.517580509185791, |
|
"learning_rate": 8.830760490086427e-06, |
|
"loss": 0.6817, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.2081984897518878, |
|
"grad_norm": 5.75648307800293, |
|
"learning_rate": 8.806431968224784e-06, |
|
"loss": 0.6644, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2081984897518878, |
|
"eval_accuracy": 0.6194174757281553, |
|
"eval_loss": 0.6810408234596252, |
|
"eval_runtime": 320.9626, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2168284789644013, |
|
"grad_norm": 6.445542812347412, |
|
"learning_rate": 8.781887194259523e-06, |
|
"loss": 0.6684, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.2254584681769147, |
|
"grad_norm": 5.923236846923828, |
|
"learning_rate": 8.757127562635374e-06, |
|
"loss": 0.6802, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.2340884573894282, |
|
"grad_norm": 5.63727331161499, |
|
"learning_rate": 8.732154480003625e-06, |
|
"loss": 0.7045, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.2427184466019416, |
|
"grad_norm": 5.639196872711182, |
|
"learning_rate": 8.706969365142202e-06, |
|
"loss": 0.6916, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.2513484358144553, |
|
"grad_norm": 6.068101406097412, |
|
"learning_rate": 8.681573648875064e-06, |
|
"loss": 0.6566, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.2513484358144553, |
|
"eval_accuracy": 0.6135922330097088, |
|
"eval_loss": 0.6820415258407593, |
|
"eval_runtime": 320.9166, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.2599784250269688, |
|
"grad_norm": 5.288263320922852, |
|
"learning_rate": 8.655968773990922e-06, |
|
"loss": 0.6696, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.2686084142394822, |
|
"grad_norm": 9.293752670288086, |
|
"learning_rate": 8.630156195161264e-06, |
|
"loss": 0.6407, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.2772384034519957, |
|
"grad_norm": 14.672719955444336, |
|
"learning_rate": 8.604137378857713e-06, |
|
"loss": 0.6507, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.2858683926645091, |
|
"grad_norm": 9.176056861877441, |
|
"learning_rate": 8.577913803268719e-06, |
|
"loss": 0.7229, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.2944983818770226, |
|
"grad_norm": 12.57158374786377, |
|
"learning_rate": 8.551486958215569e-06, |
|
"loss": 0.7024, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2944983818770226, |
|
"eval_accuracy": 0.6116504854368932, |
|
"eval_loss": 0.6744683384895325, |
|
"eval_runtime": 321.1558, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.303128371089536, |
|
"grad_norm": 6.8445305824279785, |
|
"learning_rate": 8.524858345067757e-06, |
|
"loss": 0.6842, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.3117583603020497, |
|
"grad_norm": 5.6327643394470215, |
|
"learning_rate": 8.498029476657686e-06, |
|
"loss": 0.6904, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.3203883495145632, |
|
"grad_norm": 10.025938987731934, |
|
"learning_rate": 8.471001877194708e-06, |
|
"loss": 0.6733, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.3290183387270766, |
|
"grad_norm": 6.761681079864502, |
|
"learning_rate": 8.443777082178556e-06, |
|
"loss": 0.6767, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.33764832793959, |
|
"grad_norm": 5.284752368927002, |
|
"learning_rate": 8.416356638312082e-06, |
|
"loss": 0.7241, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.33764832793959, |
|
"eval_accuracy": 0.6135922330097088, |
|
"eval_loss": 0.6697773933410645, |
|
"eval_runtime": 321.0762, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.3462783171521036, |
|
"grad_norm": 5.520620346069336, |
|
"learning_rate": 8.388742103413397e-06, |
|
"loss": 0.6738, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.354908306364617, |
|
"grad_norm": 4.6568098068237305, |
|
"learning_rate": 8.360935046327373e-06, |
|
"loss": 0.671, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.3635382955771305, |
|
"grad_norm": 4.777432441711426, |
|
"learning_rate": 8.332937046836503e-06, |
|
"loss": 0.69, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.3721682847896441, |
|
"grad_norm": 8.115592956542969, |
|
"learning_rate": 8.304749695571157e-06, |
|
"loss": 0.6583, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.3807982740021574, |
|
"grad_norm": 11.980337142944336, |
|
"learning_rate": 8.276374593919213e-06, |
|
"loss": 0.7378, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3807982740021574, |
|
"eval_accuracy": 0.6058252427184466, |
|
"eval_loss": 0.6734395027160645, |
|
"eval_runtime": 320.9778, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.389428263214671, |
|
"grad_norm": 3.5900051593780518, |
|
"learning_rate": 8.247813353935073e-06, |
|
"loss": 0.664, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.3980582524271845, |
|
"grad_norm": 14.644140243530273, |
|
"learning_rate": 8.219067598248087e-06, |
|
"loss": 0.6718, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.406688241639698, |
|
"grad_norm": 6.659509658813477, |
|
"learning_rate": 8.190138959970366e-06, |
|
"loss": 0.6476, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.4153182308522114, |
|
"grad_norm": 5.535285949707031, |
|
"learning_rate": 8.161029082603994e-06, |
|
"loss": 0.642, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.4239482200647249, |
|
"grad_norm": 7.590597152709961, |
|
"learning_rate": 8.131739619947667e-06, |
|
"loss": 0.6584, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.4239482200647249, |
|
"eval_accuracy": 0.6, |
|
"eval_loss": 0.6994197964668274, |
|
"eval_runtime": 321.0664, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.4325782092772383, |
|
"grad_norm": 13.075584411621094, |
|
"learning_rate": 8.102272236002729e-06, |
|
"loss": 0.7239, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.4412081984897518, |
|
"grad_norm": 6.066156387329102, |
|
"learning_rate": 8.072628604878638e-06, |
|
"loss": 0.7182, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.4498381877022655, |
|
"grad_norm": 4.588730335235596, |
|
"learning_rate": 8.042810410697861e-06, |
|
"loss": 0.717, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.458468176914779, |
|
"grad_norm": 3.397918224334717, |
|
"learning_rate": 8.012819347500189e-06, |
|
"loss": 0.6567, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.4670981661272924, |
|
"grad_norm": 8.24763298034668, |
|
"learning_rate": 7.982657119146495e-06, |
|
"loss": 0.6724, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.4670981661272924, |
|
"eval_accuracy": 0.6097087378640776, |
|
"eval_loss": 0.6715120077133179, |
|
"eval_runtime": 321.0917, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.4757281553398058, |
|
"grad_norm": 8.984458923339844, |
|
"learning_rate": 7.952325439221944e-06, |
|
"loss": 0.6653, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.4843581445523193, |
|
"grad_norm": 8.375741958618164, |
|
"learning_rate": 7.921826030938623e-06, |
|
"loss": 0.722, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.4929881337648327, |
|
"grad_norm": 8.309843063354492, |
|
"learning_rate": 7.891160627037653e-06, |
|
"loss": 0.7034, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.5016181229773462, |
|
"grad_norm": 7.065859794616699, |
|
"learning_rate": 7.860330969690749e-06, |
|
"loss": 0.6338, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.5102481121898599, |
|
"grad_norm": 5.86482048034668, |
|
"learning_rate": 7.829338810401238e-06, |
|
"loss": 0.6774, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.5102481121898599, |
|
"eval_accuracy": 0.6135922330097088, |
|
"eval_loss": 0.669984757900238, |
|
"eval_runtime": 321.0227, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.5188781014023731, |
|
"grad_norm": 6.1000075340271, |
|
"learning_rate": 7.798185909904552e-06, |
|
"loss": 0.6813, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.5275080906148868, |
|
"grad_norm": 8.106244087219238, |
|
"learning_rate": 7.766874038068202e-06, |
|
"loss": 0.7138, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.5361380798274002, |
|
"grad_norm": 5.946533203125, |
|
"learning_rate": 7.735404973791223e-06, |
|
"loss": 0.7025, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.5447680690399137, |
|
"grad_norm": 6.442516326904297, |
|
"learning_rate": 7.703780504903107e-06, |
|
"loss": 0.6643, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.5533980582524272, |
|
"grad_norm": 6.0701985359191895, |
|
"learning_rate": 7.672002428062245e-06, |
|
"loss": 0.6653, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.5533980582524272, |
|
"eval_accuracy": 0.6097087378640776, |
|
"eval_loss": 0.6695827841758728, |
|
"eval_runtime": 321.0661, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.5620280474649406, |
|
"grad_norm": 10.973797798156738, |
|
"learning_rate": 7.640072548653843e-06, |
|
"loss": 0.6681, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.5706580366774543, |
|
"grad_norm": 9.289361000061035, |
|
"learning_rate": 7.607992680687362e-06, |
|
"loss": 0.6297, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.5792880258899675, |
|
"grad_norm": 6.6282148361206055, |
|
"learning_rate": 7.575764646693447e-06, |
|
"loss": 0.706, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.5879180151024812, |
|
"grad_norm": 4.8196702003479, |
|
"learning_rate": 7.5433902776204015e-06, |
|
"loss": 0.6669, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.5965480043149944, |
|
"grad_norm": 8.807297706604004, |
|
"learning_rate": 7.510871412730157e-06, |
|
"loss": 0.6641, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.5965480043149944, |
|
"eval_accuracy": 0.5980582524271845, |
|
"eval_loss": 0.6732643246650696, |
|
"eval_runtime": 321.036, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.6051779935275081, |
|
"grad_norm": 4.137267589569092, |
|
"learning_rate": 7.478209899493787e-06, |
|
"loss": 0.6345, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.6138079827400216, |
|
"grad_norm": 7.294461250305176, |
|
"learning_rate": 7.445407593486535e-06, |
|
"loss": 0.6899, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.622437971952535, |
|
"grad_norm": 8.29757308959961, |
|
"learning_rate": 7.41246635828241e-06, |
|
"loss": 0.6848, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.6310679611650487, |
|
"grad_norm": 10.072659492492676, |
|
"learning_rate": 7.379388065348305e-06, |
|
"loss": 0.6829, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.639697950377562, |
|
"grad_norm": 8.695294380187988, |
|
"learning_rate": 7.346174593937676e-06, |
|
"loss": 0.7241, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.639697950377562, |
|
"eval_accuracy": 0.596116504854369, |
|
"eval_loss": 0.6652901768684387, |
|
"eval_runtime": 321.0146, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.6483279395900756, |
|
"grad_norm": 3.542787790298462, |
|
"learning_rate": 7.31282783098378e-06, |
|
"loss": 0.6428, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.6569579288025889, |
|
"grad_norm": 6.900018215179443, |
|
"learning_rate": 7.279349670992464e-06, |
|
"loss": 0.6494, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.6655879180151025, |
|
"grad_norm": 7.8714189529418945, |
|
"learning_rate": 7.245742015934547e-06, |
|
"loss": 0.5778, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.674217907227616, |
|
"grad_norm": 4.089023590087891, |
|
"learning_rate": 7.212006775137761e-06, |
|
"loss": 0.6912, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.6828478964401294, |
|
"grad_norm": 5.432620048522949, |
|
"learning_rate": 7.178145865178268e-06, |
|
"loss": 0.6496, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.6828478964401294, |
|
"eval_accuracy": 0.6116504854368932, |
|
"eval_loss": 0.6761239767074585, |
|
"eval_runtime": 320.9902, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.691477885652643, |
|
"grad_norm": 4.092471122741699, |
|
"learning_rate": 7.144161209771788e-06, |
|
"loss": 0.6757, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.7001078748651564, |
|
"grad_norm": 6.498571872711182, |
|
"learning_rate": 7.110054739664303e-06, |
|
"loss": 0.6111, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.70873786407767, |
|
"grad_norm": 9.238410949707031, |
|
"learning_rate": 7.075828392522362e-06, |
|
"loss": 0.5998, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.7173678532901833, |
|
"grad_norm": 5.266243934631348, |
|
"learning_rate": 7.04148411282301e-06, |
|
"loss": 0.655, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.725997842502697, |
|
"grad_norm": 8.122797966003418, |
|
"learning_rate": 7.0070238517433e-06, |
|
"loss": 0.662, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.725997842502697, |
|
"eval_accuracy": 0.6038834951456311, |
|
"eval_loss": 0.6728688478469849, |
|
"eval_runtime": 320.9753, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.7346278317152104, |
|
"grad_norm": 8.114389419555664, |
|
"learning_rate": 6.972449567049463e-06, |
|
"loss": 0.6923, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.7432578209277239, |
|
"grad_norm": 6.447281837463379, |
|
"learning_rate": 6.9377632229856665e-06, |
|
"loss": 0.6625, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.7518878101402373, |
|
"grad_norm": 8.996492385864258, |
|
"learning_rate": 6.902966790162425e-06, |
|
"loss": 0.6919, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.7605177993527508, |
|
"grad_norm": 5.145361423492432, |
|
"learning_rate": 6.868062245444655e-06, |
|
"loss": 0.6468, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.7691477885652644, |
|
"grad_norm": 6.459311008453369, |
|
"learning_rate": 6.833051571839347e-06, |
|
"loss": 0.7049, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.7691477885652644, |
|
"eval_accuracy": 0.6135922330097088, |
|
"eval_loss": 0.6757835149765015, |
|
"eval_runtime": 320.6068, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 8.930355072021484, |
|
"learning_rate": 6.797936758382924e-06, |
|
"loss": 0.6384, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.7864077669902914, |
|
"grad_norm": 8.780126571655273, |
|
"learning_rate": 6.762719800028231e-06, |
|
"loss": 0.6169, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.7950377562028046, |
|
"grad_norm": 7.830219745635986, |
|
"learning_rate": 6.727402697531193e-06, |
|
"loss": 0.6596, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.8036677454153183, |
|
"grad_norm": 4.703182697296143, |
|
"learning_rate": 6.69198745733716e-06, |
|
"loss": 0.6964, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.8122977346278317, |
|
"grad_norm": 4.655829906463623, |
|
"learning_rate": 6.656476091466901e-06, |
|
"loss": 0.6483, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.8122977346278317, |
|
"eval_accuracy": 0.6135922330097088, |
|
"eval_loss": 0.6741885542869568, |
|
"eval_runtime": 320.6691, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.8209277238403452, |
|
"grad_norm": 4.25952672958374, |
|
"learning_rate": 6.620870617402312e-06, |
|
"loss": 0.6732, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.8295577130528586, |
|
"grad_norm": 6.7814226150512695, |
|
"learning_rate": 6.585173057971787e-06, |
|
"loss": 0.6674, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.838187702265372, |
|
"grad_norm": 4.3662638664245605, |
|
"learning_rate": 6.5493854412352985e-06, |
|
"loss": 0.6807, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.8468176914778858, |
|
"grad_norm": 5.596447467803955, |
|
"learning_rate": 6.5135098003691865e-06, |
|
"loss": 0.6637, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.855447680690399, |
|
"grad_norm": 4.839741230010986, |
|
"learning_rate": 6.477548173550635e-06, |
|
"loss": 0.678, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.855447680690399, |
|
"eval_accuracy": 0.6310679611650486, |
|
"eval_loss": 0.6695934534072876, |
|
"eval_runtime": 320.6467, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.8640776699029127, |
|
"grad_norm": 11.375150680541992, |
|
"learning_rate": 6.441502603841892e-06, |
|
"loss": 0.6592, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.8727076591154261, |
|
"grad_norm": 6.302811145782471, |
|
"learning_rate": 6.405375139074194e-06, |
|
"loss": 0.6413, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.8813376483279396, |
|
"grad_norm": 9.698513984680176, |
|
"learning_rate": 6.369167831731419e-06, |
|
"loss": 0.6304, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.889967637540453, |
|
"grad_norm": 9.770709991455078, |
|
"learning_rate": 6.332882738833485e-06, |
|
"loss": 0.6144, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.8985976267529665, |
|
"grad_norm": 10.665081977844238, |
|
"learning_rate": 6.296521921819489e-06, |
|
"loss": 0.678, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.8985976267529665, |
|
"eval_accuracy": 0.6233009708737864, |
|
"eval_loss": 0.6689735054969788, |
|
"eval_runtime": 320.6295, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.9072276159654802, |
|
"grad_norm": 8.206169128417969, |
|
"learning_rate": 6.260087446430582e-06, |
|
"loss": 0.6622, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.9158576051779934, |
|
"grad_norm": 11.89337158203125, |
|
"learning_rate": 6.223581382592625e-06, |
|
"loss": 0.6567, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.924487594390507, |
|
"grad_norm": 4.916356086730957, |
|
"learning_rate": 6.18700580429857e-06, |
|
"loss": 0.6634, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.9331175836030206, |
|
"grad_norm": 9.565736770629883, |
|
"learning_rate": 6.150362789490654e-06, |
|
"loss": 0.6532, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.941747572815534, |
|
"grad_norm": 10.54036808013916, |
|
"learning_rate": 6.113654419942334e-06, |
|
"loss": 0.6953, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.941747572815534, |
|
"eval_accuracy": 0.625242718446602, |
|
"eval_loss": 0.6624494791030884, |
|
"eval_runtime": 320.6343, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.9503775620280475, |
|
"grad_norm": 12.351181983947754, |
|
"learning_rate": 6.0768827811400166e-06, |
|
"loss": 0.71, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.959007551240561, |
|
"grad_norm": 6.94906759262085, |
|
"learning_rate": 6.040049962164585e-06, |
|
"loss": 0.6464, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.9676375404530746, |
|
"grad_norm": 6.037535667419434, |
|
"learning_rate": 6.0031580555727005e-06, |
|
"loss": 0.6598, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.9762675296655878, |
|
"grad_norm": 11.901267051696777, |
|
"learning_rate": 5.9662091572779325e-06, |
|
"loss": 0.6292, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.9848975188781015, |
|
"grad_norm": 7.471567153930664, |
|
"learning_rate": 5.929205366431679e-06, |
|
"loss": 0.6969, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.9848975188781015, |
|
"eval_accuracy": 0.6368932038834951, |
|
"eval_loss": 0.6725260019302368, |
|
"eval_runtime": 320.5652, |
|
"eval_samples_per_second": 1.607, |
|
"eval_steps_per_second": 1.607, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.9935275080906147, |
|
"grad_norm": 4.360079765319824, |
|
"learning_rate": 5.892148785303905e-06, |
|
"loss": 0.6386, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.0021574973031284, |
|
"grad_norm": 7.370548725128174, |
|
"learning_rate": 5.855041519163718e-06, |
|
"loss": 0.5936, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.0107874865156417, |
|
"grad_norm": 11.645364761352539, |
|
"learning_rate": 5.817885676159754e-06, |
|
"loss": 0.7021, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.0194174757281553, |
|
"grad_norm": 9.975643157958984, |
|
"learning_rate": 5.78068336720041e-06, |
|
"loss": 0.62, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.028047464940669, |
|
"grad_norm": 8.763169288635254, |
|
"learning_rate": 5.743436705833922e-06, |
|
"loss": 0.6492, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.028047464940669, |
|
"eval_accuracy": 0.6485436893203883, |
|
"eval_loss": 0.656815767288208, |
|
"eval_runtime": 320.6788, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.0366774541531822, |
|
"grad_norm": 6.766859531402588, |
|
"learning_rate": 5.706147808128288e-06, |
|
"loss": 0.6385, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.045307443365696, |
|
"grad_norm": 7.149226665496826, |
|
"learning_rate": 5.668818792551052e-06, |
|
"loss": 0.5838, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.053937432578209, |
|
"grad_norm": 6.320857048034668, |
|
"learning_rate": 5.6314517798489395e-06, |
|
"loss": 0.655, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.062567421790723, |
|
"grad_norm": 12.915064811706543, |
|
"learning_rate": 5.594048892927382e-06, |
|
"loss": 0.7095, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.071197411003236, |
|
"grad_norm": 7.46158504486084, |
|
"learning_rate": 5.556612256729909e-06, |
|
"loss": 0.6572, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.071197411003236, |
|
"eval_accuracy": 0.6446601941747573, |
|
"eval_loss": 0.669795036315918, |
|
"eval_runtime": 320.7237, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.0798274002157497, |
|
"grad_norm": 9.09875202178955, |
|
"learning_rate": 5.519143998117424e-06, |
|
"loss": 0.6518, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.0884573894282634, |
|
"grad_norm": 9.286842346191406, |
|
"learning_rate": 5.48164624574737e-06, |
|
"loss": 0.6492, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.0970873786407767, |
|
"grad_norm": 5.891538143157959, |
|
"learning_rate": 5.444121129952799e-06, |
|
"loss": 0.648, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.1057173678532903, |
|
"grad_norm": 11.724071502685547, |
|
"learning_rate": 5.406570782621341e-06, |
|
"loss": 0.6533, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.1143473570658036, |
|
"grad_norm": 8.159801483154297, |
|
"learning_rate": 5.368997337074088e-06, |
|
"loss": 0.6204, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.1143473570658036, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6549546122550964, |
|
"eval_runtime": 320.7153, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.1229773462783172, |
|
"grad_norm": 8.30516529083252, |
|
"learning_rate": 5.331402927944392e-06, |
|
"loss": 0.5746, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.1316073354908305, |
|
"grad_norm": 6.368971824645996, |
|
"learning_rate": 5.293789691056601e-06, |
|
"loss": 0.6352, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.140237324703344, |
|
"grad_norm": 18.369422912597656, |
|
"learning_rate": 5.256159763304703e-06, |
|
"loss": 0.6815, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.148867313915858, |
|
"grad_norm": 7.470778465270996, |
|
"learning_rate": 5.218515282530934e-06, |
|
"loss": 0.5849, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.157497303128371, |
|
"grad_norm": 8.369938850402832, |
|
"learning_rate": 5.180858387404325e-06, |
|
"loss": 0.6479, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.157497303128371, |
|
"eval_accuracy": 0.6446601941747573, |
|
"eval_loss": 0.6610180735588074, |
|
"eval_runtime": 320.6988, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.1661272923408847, |
|
"grad_norm": 12.514945030212402, |
|
"learning_rate": 5.143191217299189e-06, |
|
"loss": 0.5588, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.174757281553398, |
|
"grad_norm": 10.213220596313477, |
|
"learning_rate": 5.10551591217359e-06, |
|
"loss": 0.6862, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.1833872707659117, |
|
"grad_norm": 10.838960647583008, |
|
"learning_rate": 5.067834612447755e-06, |
|
"loss": 0.6218, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.192017259978425, |
|
"grad_norm": 8.767598152160645, |
|
"learning_rate": 5.0301494588824795e-06, |
|
"loss": 0.5711, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.2006472491909386, |
|
"grad_norm": 6.138967514038086, |
|
"learning_rate": 4.9924625924575095e-06, |
|
"loss": 0.6954, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.2006472491909386, |
|
"eval_accuracy": 0.6679611650485436, |
|
"eval_loss": 0.6637104153633118, |
|
"eval_runtime": 320.7599, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.209277238403452, |
|
"grad_norm": 10.984577178955078, |
|
"learning_rate": 4.954776154249896e-06, |
|
"loss": 0.6567, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.2179072276159655, |
|
"grad_norm": 8.720921516418457, |
|
"learning_rate": 4.9170922853123635e-06, |
|
"loss": 0.6283, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.226537216828479, |
|
"grad_norm": 10.784737586975098, |
|
"learning_rate": 4.879413126551675e-06, |
|
"loss": 0.6072, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.2351672060409924, |
|
"grad_norm": 6.139902114868164, |
|
"learning_rate": 4.84174081860699e-06, |
|
"loss": 0.5966, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.243797195253506, |
|
"grad_norm": 7.9166083335876465, |
|
"learning_rate": 4.8040775017282644e-06, |
|
"loss": 0.5668, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.243797195253506, |
|
"eval_accuracy": 0.658252427184466, |
|
"eval_loss": 0.6660070419311523, |
|
"eval_runtime": 320.7212, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.2524271844660193, |
|
"grad_norm": 6.704747676849365, |
|
"learning_rate": 4.766425315654648e-06, |
|
"loss": 0.5675, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.261057173678533, |
|
"grad_norm": 6.141285419464111, |
|
"learning_rate": 4.728786399492923e-06, |
|
"loss": 0.6543, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.269687162891046, |
|
"grad_norm": 16.798852920532227, |
|
"learning_rate": 4.69116289159598e-06, |
|
"loss": 0.5984, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.27831715210356, |
|
"grad_norm": 7.124361038208008, |
|
"learning_rate": 4.653556929441332e-06, |
|
"loss": 0.5777, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.286947141316073, |
|
"grad_norm": 13.590773582458496, |
|
"learning_rate": 4.61597064950967e-06, |
|
"loss": 0.6185, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.286947141316073, |
|
"eval_accuracy": 0.6679611650485436, |
|
"eval_loss": 0.6793263554573059, |
|
"eval_runtime": 320.6049, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.295577130528587, |
|
"grad_norm": 8.081377983093262, |
|
"learning_rate": 4.578406187163503e-06, |
|
"loss": 0.5651, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.3042071197411005, |
|
"grad_norm": 6.233886241912842, |
|
"learning_rate": 4.540865676525828e-06, |
|
"loss": 0.6087, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.3128371089536137, |
|
"grad_norm": 5.7994489669799805, |
|
"learning_rate": 4.503351250358893e-06, |
|
"loss": 0.6153, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.3214670981661274, |
|
"grad_norm": 21.2513427734375, |
|
"learning_rate": 4.465865039943023e-06, |
|
"loss": 0.5765, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.3300970873786406, |
|
"grad_norm": 13.356746673583984, |
|
"learning_rate": 4.428409174955548e-06, |
|
"loss": 0.5314, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.3300970873786406, |
|
"eval_accuracy": 0.6718446601941748, |
|
"eval_loss": 0.6751753091812134, |
|
"eval_runtime": 320.6989, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.3387270765911543, |
|
"grad_norm": 10.287054061889648, |
|
"learning_rate": 4.3909857833498015e-06, |
|
"loss": 0.6288, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.347357065803668, |
|
"grad_norm": 8.844134330749512, |
|
"learning_rate": 4.353596991234228e-06, |
|
"loss": 0.6502, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.355987055016181, |
|
"grad_norm": 18.77345848083496, |
|
"learning_rate": 4.3162449227516015e-06, |
|
"loss": 0.6461, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.364617044228695, |
|
"grad_norm": 5.465780258178711, |
|
"learning_rate": 4.278931699958337e-06, |
|
"loss": 0.5786, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.373247033441208, |
|
"grad_norm": 9.964437484741211, |
|
"learning_rate": 4.241659442703937e-06, |
|
"loss": 0.6406, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.373247033441208, |
|
"eval_accuracy": 0.6563106796116505, |
|
"eval_loss": 0.6680858731269836, |
|
"eval_runtime": 320.7173, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.381877022653722, |
|
"grad_norm": 16.344274520874023, |
|
"learning_rate": 4.2044302685105635e-06, |
|
"loss": 0.6201, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.390507011866235, |
|
"grad_norm": 6.842400074005127, |
|
"learning_rate": 4.167246292452724e-06, |
|
"loss": 0.5944, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.3991370010787487, |
|
"grad_norm": 15.446759223937988, |
|
"learning_rate": 4.130109627037124e-06, |
|
"loss": 0.5883, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.407766990291262, |
|
"grad_norm": 8.021566390991211, |
|
"learning_rate": 4.093022382082639e-06, |
|
"loss": 0.6618, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.4163969795037756, |
|
"grad_norm": 10.198580741882324, |
|
"learning_rate": 4.0559866646004546e-06, |
|
"loss": 0.7011, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.4163969795037756, |
|
"eval_accuracy": 0.6679611650485436, |
|
"eval_loss": 0.6721732020378113, |
|
"eval_runtime": 320.5897, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.4250269687162893, |
|
"grad_norm": 7.147483825683594, |
|
"learning_rate": 4.0190045786743656e-06, |
|
"loss": 0.5454, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.4336569579288025, |
|
"grad_norm": 6.587264060974121, |
|
"learning_rate": 3.982078225341232e-06, |
|
"loss": 0.5114, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.4422869471413162, |
|
"grad_norm": 9.162304878234863, |
|
"learning_rate": 3.945209702471622e-06, |
|
"loss": 0.712, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.4509169363538295, |
|
"grad_norm": 8.858553886413574, |
|
"learning_rate": 3.908401104650621e-06, |
|
"loss": 0.6119, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.459546925566343, |
|
"grad_norm": 7.771361827850342, |
|
"learning_rate": 3.871654523058831e-06, |
|
"loss": 0.6195, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.459546925566343, |
|
"eval_accuracy": 0.6757281553398058, |
|
"eval_loss": 0.6643590927124023, |
|
"eval_runtime": 320.706, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.4681769147788564, |
|
"grad_norm": 7.508529186248779, |
|
"learning_rate": 3.834972045353575e-06, |
|
"loss": 0.6087, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.47680690399137, |
|
"grad_norm": 9.493097305297852, |
|
"learning_rate": 3.798355755550292e-06, |
|
"loss": 0.6224, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.4854368932038833, |
|
"grad_norm": 7.044253826141357, |
|
"learning_rate": 3.7618077339041244e-06, |
|
"loss": 0.6495, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.494066882416397, |
|
"grad_norm": 6.932374954223633, |
|
"learning_rate": 3.725330056791753e-06, |
|
"loss": 0.627, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.5026968716289106, |
|
"grad_norm": 8.32701301574707, |
|
"learning_rate": 3.6889247965934195e-06, |
|
"loss": 0.6675, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.5026968716289106, |
|
"eval_accuracy": 0.6601941747572816, |
|
"eval_loss": 0.6530495285987854, |
|
"eval_runtime": 320.625, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.511326860841424, |
|
"grad_norm": 7.712283134460449, |
|
"learning_rate": 3.6525940215751987e-06, |
|
"loss": 0.6522, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.5199568500539375, |
|
"grad_norm": 8.3215913772583, |
|
"learning_rate": 3.6163397957714895e-06, |
|
"loss": 0.6759, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.528586839266451, |
|
"grad_norm": 6.627832412719727, |
|
"learning_rate": 3.5801641788677576e-06, |
|
"loss": 0.6035, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.5372168284789645, |
|
"grad_norm": 11.45533561706543, |
|
"learning_rate": 3.5440692260835162e-06, |
|
"loss": 0.6256, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.545846817691478, |
|
"grad_norm": 6.252264499664307, |
|
"learning_rate": 3.508056988055564e-06, |
|
"loss": 0.5796, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.545846817691478, |
|
"eval_accuracy": 0.6601941747572816, |
|
"eval_loss": 0.6489056348800659, |
|
"eval_runtime": 320.6022, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.5544768069039914, |
|
"grad_norm": 10.386983871459961, |
|
"learning_rate": 3.4721295107214835e-06, |
|
"loss": 0.4864, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.5631067961165046, |
|
"grad_norm": 8.145389556884766, |
|
"learning_rate": 3.4362888352034153e-06, |
|
"loss": 0.6728, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.5717367853290183, |
|
"grad_norm": 6.486176013946533, |
|
"learning_rate": 3.4005369976920837e-06, |
|
"loss": 0.6055, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.580366774541532, |
|
"grad_norm": 10.21779727935791, |
|
"learning_rate": 3.3648760293311267e-06, |
|
"loss": 0.6123, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.588996763754045, |
|
"grad_norm": 8.619269371032715, |
|
"learning_rate": 3.3293079561016957e-06, |
|
"loss": 0.6148, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.588996763754045, |
|
"eval_accuracy": 0.6679611650485436, |
|
"eval_loss": 0.6675190329551697, |
|
"eval_runtime": 320.4804, |
|
"eval_samples_per_second": 1.607, |
|
"eval_steps_per_second": 1.607, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.597626752966559, |
|
"grad_norm": 14.024328231811523, |
|
"learning_rate": 3.2938347987073576e-06, |
|
"loss": 0.6054, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.606256742179072, |
|
"grad_norm": 13.966845512390137, |
|
"learning_rate": 3.2584585724592967e-06, |
|
"loss": 0.5767, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.614886731391586, |
|
"grad_norm": 6.929962635040283, |
|
"learning_rate": 3.223181287161812e-06, |
|
"loss": 0.5214, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.6235167206040995, |
|
"grad_norm": 9.28740406036377, |
|
"learning_rate": 3.1880049469981468e-06, |
|
"loss": 0.5823, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.6321467098166127, |
|
"grad_norm": 22.37981414794922, |
|
"learning_rate": 3.1529315504166147e-06, |
|
"loss": 0.6293, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.6321467098166127, |
|
"eval_accuracy": 0.6368932038834951, |
|
"eval_loss": 0.6685478091239929, |
|
"eval_runtime": 321.0635, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.6407766990291264, |
|
"grad_norm": 17.161617279052734, |
|
"learning_rate": 3.117963090017071e-06, |
|
"loss": 0.5728, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.6494066882416396, |
|
"grad_norm": 19.009254455566406, |
|
"learning_rate": 3.08310155243771e-06, |
|
"loss": 0.7621, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.6580366774541533, |
|
"grad_norm": 12.797933578491211, |
|
"learning_rate": 3.048348918242191e-06, |
|
"loss": 0.5567, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 10.396708488464355, |
|
"learning_rate": 3.013707161807128e-06, |
|
"loss": 0.6592, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.67529665587918, |
|
"grad_norm": 8.590036392211914, |
|
"learning_rate": 2.9791782512099098e-06, |
|
"loss": 0.6095, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.67529665587918, |
|
"eval_accuracy": 0.6621359223300971, |
|
"eval_loss": 0.6717608571052551, |
|
"eval_runtime": 321.0303, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.6839266450916934, |
|
"grad_norm": 11.886474609375, |
|
"learning_rate": 2.944764148116902e-06, |
|
"loss": 0.4862, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.692556634304207, |
|
"grad_norm": 15.282882690429688, |
|
"learning_rate": 2.9104668076719876e-06, |
|
"loss": 0.5833, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.701186623516721, |
|
"grad_norm": 15.11883544921875, |
|
"learning_rate": 2.8762881783855025e-06, |
|
"loss": 0.5887, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.709816612729234, |
|
"grad_norm": 9.773431777954102, |
|
"learning_rate": 2.8422302020235252e-06, |
|
"loss": 0.6644, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.7184466019417477, |
|
"grad_norm": 16.19442367553711, |
|
"learning_rate": 2.808294813497563e-06, |
|
"loss": 0.5422, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.7184466019417477, |
|
"eval_accuracy": 0.6485436893203883, |
|
"eval_loss": 0.6904874444007874, |
|
"eval_runtime": 321.1401, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.727076591154261, |
|
"grad_norm": 16.843564987182617, |
|
"learning_rate": 2.7744839407546374e-06, |
|
"loss": 0.6523, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.7357065803667746, |
|
"grad_norm": 18.18024253845215, |
|
"learning_rate": 2.7407995046677377e-06, |
|
"loss": 0.5283, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.7443365695792883, |
|
"grad_norm": 20.41519546508789, |
|
"learning_rate": 2.7072434189266945e-06, |
|
"loss": 0.5934, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.7529665587918015, |
|
"grad_norm": 14.765863418579102, |
|
"learning_rate": 2.6738175899294703e-06, |
|
"loss": 0.6699, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.7615965480043148, |
|
"grad_norm": 17.99534034729004, |
|
"learning_rate": 2.640523916673838e-06, |
|
"loss": 0.6089, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.7615965480043148, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6814106106758118, |
|
"eval_runtime": 321.1084, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.7702265372168284, |
|
"grad_norm": 5.104621887207031, |
|
"learning_rate": 2.607364290649501e-06, |
|
"loss": 0.6884, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.778856526429342, |
|
"grad_norm": 17.406665802001953, |
|
"learning_rate": 2.574340595730633e-06, |
|
"loss": 0.6264, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.7874865156418553, |
|
"grad_norm": 8.697972297668457, |
|
"learning_rate": 2.541454708068855e-06, |
|
"loss": 0.5552, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.796116504854369, |
|
"grad_norm": 7.472986698150635, |
|
"learning_rate": 2.5087084959866403e-06, |
|
"loss": 0.596, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.8047464940668823, |
|
"grad_norm": 11.333291053771973, |
|
"learning_rate": 2.476103819871166e-06, |
|
"loss": 0.6238, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.8047464940668823, |
|
"eval_accuracy": 0.6466019417475728, |
|
"eval_loss": 0.6738768815994263, |
|
"eval_runtime": 321.0019, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.813376483279396, |
|
"grad_norm": 15.323911666870117, |
|
"learning_rate": 2.44364253206864e-06, |
|
"loss": 0.6472, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.8220064724919096, |
|
"grad_norm": 14.362588882446289, |
|
"learning_rate": 2.4113264767790433e-06, |
|
"loss": 0.6375, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.830636461704423, |
|
"grad_norm": 11.027913093566895, |
|
"learning_rate": 2.379157489951367e-06, |
|
"loss": 0.6185, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.839266450916936, |
|
"grad_norm": 8.004063606262207, |
|
"learning_rate": 2.3471373991793116e-06, |
|
"loss": 0.6608, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.8478964401294498, |
|
"grad_norm": 11.401987075805664, |
|
"learning_rate": 2.315268023597447e-06, |
|
"loss": 0.7386, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.8478964401294498, |
|
"eval_accuracy": 0.6485436893203883, |
|
"eval_loss": 0.6621807813644409, |
|
"eval_runtime": 321.0895, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.8565264293419634, |
|
"grad_norm": 11.381020545959473, |
|
"learning_rate": 2.2835511737778687e-06, |
|
"loss": 0.5386, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.8651564185544767, |
|
"grad_norm": 14.900254249572754, |
|
"learning_rate": 2.2519886516273365e-06, |
|
"loss": 0.6754, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.8737864077669903, |
|
"grad_norm": 10.069350242614746, |
|
"learning_rate": 2.220582250284905e-06, |
|
"loss": 0.6129, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.8824163969795036, |
|
"grad_norm": 8.782756805419922, |
|
"learning_rate": 2.189333754020046e-06, |
|
"loss": 0.6185, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.8910463861920173, |
|
"grad_norm": 8.9526948928833, |
|
"learning_rate": 2.158244938131277e-06, |
|
"loss": 0.6166, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.8910463861920173, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6567447781562805, |
|
"eval_runtime": 320.6468, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.899676375404531, |
|
"grad_norm": 6.0573625564575195, |
|
"learning_rate": 2.12731756884532e-06, |
|
"loss": 0.6601, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.908306364617044, |
|
"grad_norm": 15.11607837677002, |
|
"learning_rate": 2.096553403216739e-06, |
|
"loss": 0.7397, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.916936353829558, |
|
"grad_norm": 7.567427635192871, |
|
"learning_rate": 2.0659541890281236e-06, |
|
"loss": 0.5167, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.925566343042071, |
|
"grad_norm": 11.045202255249023, |
|
"learning_rate": 2.0355216646908016e-06, |
|
"loss": 0.6497, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.9341963322545848, |
|
"grad_norm": 14.782462120056152, |
|
"learning_rate": 2.0052575591460636e-06, |
|
"loss": 0.5866, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.9341963322545848, |
|
"eval_accuracy": 0.6504854368932039, |
|
"eval_loss": 0.6615984439849854, |
|
"eval_runtime": 320.6259, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.9428263214670984, |
|
"grad_norm": 5.701985836029053, |
|
"learning_rate": 1.975163591766946e-06, |
|
"loss": 0.6723, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.9514563106796117, |
|
"grad_norm": 10.19908618927002, |
|
"learning_rate": 1.9452414722605432e-06, |
|
"loss": 0.592, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.960086299892125, |
|
"grad_norm": 8.34867000579834, |
|
"learning_rate": 1.915492900570887e-06, |
|
"loss": 0.6623, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.9687162891046386, |
|
"grad_norm": 14.363434791564941, |
|
"learning_rate": 1.885919566782352e-06, |
|
"loss": 0.6295, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.9773462783171523, |
|
"grad_norm": 9.90467357635498, |
|
"learning_rate": 1.8565231510236531e-06, |
|
"loss": 0.6348, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.9773462783171523, |
|
"eval_accuracy": 0.6563106796116505, |
|
"eval_loss": 0.6633828282356262, |
|
"eval_runtime": 320.6481, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.9859762675296655, |
|
"grad_norm": 13.353963851928711, |
|
"learning_rate": 1.8273053233723843e-06, |
|
"loss": 0.5338, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.994606256742179, |
|
"grad_norm": 14.00833797454834, |
|
"learning_rate": 1.798267743760142e-06, |
|
"loss": 0.633, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 3.0032362459546924, |
|
"grad_norm": 14.501118659973145, |
|
"learning_rate": 1.7694120618782169e-06, |
|
"loss": 0.5085, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.011866235167206, |
|
"grad_norm": 9.27495002746582, |
|
"learning_rate": 1.7407399170838802e-06, |
|
"loss": 0.5477, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.0204962243797193, |
|
"grad_norm": 12.652294158935547, |
|
"learning_rate": 1.7122529383072346e-06, |
|
"loss": 0.5907, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.0204962243797193, |
|
"eval_accuracy": 0.658252427184466, |
|
"eval_loss": 0.6642096042633057, |
|
"eval_runtime": 320.7217, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.029126213592233, |
|
"grad_norm": 12.352764129638672, |
|
"learning_rate": 1.68395274395868e-06, |
|
"loss": 0.5256, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.0377562028047467, |
|
"grad_norm": 6.0259222984313965, |
|
"learning_rate": 1.6558409418369686e-06, |
|
"loss": 0.4449, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.04638619201726, |
|
"grad_norm": 4.154427528381348, |
|
"learning_rate": 1.6279191290378566e-06, |
|
"loss": 0.449, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.0550161812297736, |
|
"grad_norm": 12.186491012573242, |
|
"learning_rate": 1.6001888918633728e-06, |
|
"loss": 0.4746, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.063646170442287, |
|
"grad_norm": 9.144371032714844, |
|
"learning_rate": 1.5726518057316969e-06, |
|
"loss": 0.4985, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.063646170442287, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6903661489486694, |
|
"eval_runtime": 320.6325, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.0722761596548005, |
|
"grad_norm": 14.253432273864746, |
|
"learning_rate": 1.5453094350876563e-06, |
|
"loss": 0.5309, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.0809061488673137, |
|
"grad_norm": 14.948261260986328, |
|
"learning_rate": 1.5181633333138456e-06, |
|
"loss": 0.5263, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.0895361380798274, |
|
"grad_norm": 9.058218955993652, |
|
"learning_rate": 1.4912150426423766e-06, |
|
"loss": 0.5077, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.098166127292341, |
|
"grad_norm": 17.286836624145508, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.5556, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.1067961165048543, |
|
"grad_norm": 9.762429237365723, |
|
"learning_rate": 1.4379180072574335e-06, |
|
"loss": 0.53, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.1067961165048543, |
|
"eval_accuracy": 0.6466019417475728, |
|
"eval_loss": 0.6925872564315796, |
|
"eval_runtime": 320.6091, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.115426105717368, |
|
"grad_norm": 15.105671882629395, |
|
"learning_rate": 1.411572290470401e-06, |
|
"loss": 0.5956, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 3.1240560949298812, |
|
"grad_norm": 13.916862487792969, |
|
"learning_rate": 1.3854304404665796e-06, |
|
"loss": 0.5019, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 3.132686084142395, |
|
"grad_norm": 14.544822692871094, |
|
"learning_rate": 1.359493942424241e-06, |
|
"loss": 0.5761, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 3.141316073354908, |
|
"grad_norm": 15.535740852355957, |
|
"learning_rate": 1.3337642698551428e-06, |
|
"loss": 0.4957, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 3.149946062567422, |
|
"grad_norm": 13.230164527893066, |
|
"learning_rate": 1.3082428845208155e-06, |
|
"loss": 0.5728, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.149946062567422, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6939272880554199, |
|
"eval_runtime": 320.6286, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.158576051779935, |
|
"grad_norm": 11.026480674743652, |
|
"learning_rate": 1.2829312363495155e-06, |
|
"loss": 0.5602, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 3.1672060409924487, |
|
"grad_norm": 10.449764251708984, |
|
"learning_rate": 1.2578307633538505e-06, |
|
"loss": 0.6031, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 3.1758360302049624, |
|
"grad_norm": 13.517521858215332, |
|
"learning_rate": 1.232942891549083e-06, |
|
"loss": 0.6053, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 3.1844660194174756, |
|
"grad_norm": 10.760808944702148, |
|
"learning_rate": 1.2082690348721204e-06, |
|
"loss": 0.5024, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 3.1930960086299893, |
|
"grad_norm": 14.012762069702148, |
|
"learning_rate": 1.1838105951011758e-06, |
|
"loss": 0.5011, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.1930960086299893, |
|
"eval_accuracy": 0.6601941747572816, |
|
"eval_loss": 0.6916132569313049, |
|
"eval_runtime": 320.6627, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.2017259978425026, |
|
"grad_norm": 11.190227508544922, |
|
"learning_rate": 1.1595689617761363e-06, |
|
"loss": 0.4906, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 3.2103559870550162, |
|
"grad_norm": 17.964550018310547, |
|
"learning_rate": 1.1355455121196234e-06, |
|
"loss": 0.5705, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 3.2189859762675295, |
|
"grad_norm": 21.885299682617188, |
|
"learning_rate": 1.1117416109587403e-06, |
|
"loss": 0.6581, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 3.227615965480043, |
|
"grad_norm": 10.283282279968262, |
|
"learning_rate": 1.0881586106475406e-06, |
|
"loss": 0.6133, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.236245954692557, |
|
"grad_norm": 8.597122192382812, |
|
"learning_rate": 1.0647978509901946e-06, |
|
"loss": 0.4987, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.236245954692557, |
|
"eval_accuracy": 0.654368932038835, |
|
"eval_loss": 0.6906397938728333, |
|
"eval_runtime": 320.6953, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.24487594390507, |
|
"grad_norm": 10.815213203430176, |
|
"learning_rate": 1.0416606591648737e-06, |
|
"loss": 0.6638, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.2535059331175837, |
|
"grad_norm": 7.768321990966797, |
|
"learning_rate": 1.018748349648348e-06, |
|
"loss": 0.5556, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.262135922330097, |
|
"grad_norm": 11.6558837890625, |
|
"learning_rate": 9.960622241413137e-07, |
|
"loss": 0.5817, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.2707659115426106, |
|
"grad_norm": 14.339502334594727, |
|
"learning_rate": 9.736035714944314e-07, |
|
"loss": 0.5237, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.279395900755124, |
|
"grad_norm": 15.16897964477539, |
|
"learning_rate": 9.513736676351104e-07, |
|
"loss": 0.5909, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.279395900755124, |
|
"eval_accuracy": 0.658252427184466, |
|
"eval_loss": 0.6882277727127075, |
|
"eval_runtime": 320.663, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.2880258899676376, |
|
"grad_norm": 13.602522850036621, |
|
"learning_rate": 9.293737754950166e-07, |
|
"loss": 0.5828, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.2966558791801512, |
|
"grad_norm": 17.136140823364258, |
|
"learning_rate": 9.076051449383294e-07, |
|
"loss": 0.6515, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.3052858683926645, |
|
"grad_norm": 13.352173805236816, |
|
"learning_rate": 8.860690126907229e-07, |
|
"loss": 0.5751, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.313915857605178, |
|
"grad_norm": 21.102169036865234, |
|
"learning_rate": 8.64766602269112e-07, |
|
"loss": 0.6061, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.3225458468176914, |
|
"grad_norm": 23.22005844116211, |
|
"learning_rate": 8.436991239121451e-07, |
|
"loss": 0.5194, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.3225458468176914, |
|
"eval_accuracy": 0.6524271844660194, |
|
"eval_loss": 0.6874131560325623, |
|
"eval_runtime": 320.7489, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.331175836030205, |
|
"grad_norm": 8.979095458984375, |
|
"learning_rate": 8.22867774511435e-07, |
|
"loss": 0.5395, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.3398058252427183, |
|
"grad_norm": 9.126049041748047, |
|
"learning_rate": 8.022737375435735e-07, |
|
"loss": 0.566, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.348435814455232, |
|
"grad_norm": 8.811643600463867, |
|
"learning_rate": 7.81918183002891e-07, |
|
"loss": 0.5703, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.357065803667745, |
|
"grad_norm": 9.9462308883667, |
|
"learning_rate": 7.618022673349834e-07, |
|
"loss": 0.5318, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.365695792880259, |
|
"grad_norm": 15.365378379821777, |
|
"learning_rate": 7.419271333710154e-07, |
|
"loss": 0.5925, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.365695792880259, |
|
"eval_accuracy": 0.6601941747572816, |
|
"eval_loss": 0.685357391834259, |
|
"eval_runtime": 320.5481, |
|
"eval_samples_per_second": 1.607, |
|
"eval_steps_per_second": 1.607, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.3743257820927726, |
|
"grad_norm": 13.633624076843262, |
|
"learning_rate": 7.222939102627919e-07, |
|
"loss": 0.6622, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.382955771305286, |
|
"grad_norm": 14.377915382385254, |
|
"learning_rate": 7.029037134186112e-07, |
|
"loss": 0.4916, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.3915857605177995, |
|
"grad_norm": 11.740239143371582, |
|
"learning_rate": 6.837576444398913e-07, |
|
"loss": 0.5409, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.4002157497303127, |
|
"grad_norm": 10.254107475280762, |
|
"learning_rate": 6.648567910585874e-07, |
|
"loss": 0.6555, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 3.4088457389428264, |
|
"grad_norm": 16.456100463867188, |
|
"learning_rate": 6.46202227075401e-07, |
|
"loss": 0.4709, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.4088457389428264, |
|
"eval_accuracy": 0.6621359223300971, |
|
"eval_loss": 0.6879016160964966, |
|
"eval_runtime": 320.8657, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.4174757281553396, |
|
"grad_norm": 6.954639911651611, |
|
"learning_rate": 6.277950122987631e-07, |
|
"loss": 0.542, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.4261057173678533, |
|
"grad_norm": 16.155237197875977, |
|
"learning_rate": 6.096361924846333e-07, |
|
"loss": 0.6621, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 3.4347357065803665, |
|
"grad_norm": 10.976309776306152, |
|
"learning_rate": 5.917267992770881e-07, |
|
"loss": 0.5217, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 3.44336569579288, |
|
"grad_norm": 17.910186767578125, |
|
"learning_rate": 5.740678501497049e-07, |
|
"loss": 0.669, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.451995685005394, |
|
"grad_norm": 16.26474952697754, |
|
"learning_rate": 5.566603483477607e-07, |
|
"loss": 0.5317, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.451995685005394, |
|
"eval_accuracy": 0.6601941747572816, |
|
"eval_loss": 0.6886419057846069, |
|
"eval_runtime": 320.5766, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.460625674217907, |
|
"grad_norm": 22.223215103149414, |
|
"learning_rate": 5.395052828312359e-07, |
|
"loss": 0.5363, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 3.469255663430421, |
|
"grad_norm": 8.730759620666504, |
|
"learning_rate": 5.226036282186286e-07, |
|
"loss": 0.6681, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.477885652642934, |
|
"grad_norm": 8.632150650024414, |
|
"learning_rate": 5.059563447315829e-07, |
|
"loss": 0.5089, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 3.4865156418554477, |
|
"grad_norm": 9.663848876953125, |
|
"learning_rate": 4.895643781403375e-07, |
|
"loss": 0.4644, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 3.4951456310679614, |
|
"grad_norm": 11.52153205871582, |
|
"learning_rate": 4.73428659709998e-07, |
|
"loss": 0.5821, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.4951456310679614, |
|
"eval_accuracy": 0.6660194174757281, |
|
"eval_loss": 0.6889378428459167, |
|
"eval_runtime": 320.9557, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.5037756202804746, |
|
"grad_norm": 17.435976028442383, |
|
"learning_rate": 4.575501061476195e-07, |
|
"loss": 0.5951, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 3.512405609492988, |
|
"grad_norm": 13.329899787902832, |
|
"learning_rate": 4.4192961955013766e-07, |
|
"loss": 0.5985, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 3.5210355987055015, |
|
"grad_norm": 10.234993934631348, |
|
"learning_rate": 4.265680873531136e-07, |
|
"loss": 0.5232, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.529665587918015, |
|
"grad_norm": 13.122269630432129, |
|
"learning_rate": 4.1146638228031557e-07, |
|
"loss": 0.5554, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 3.5382955771305284, |
|
"grad_norm": 10.752240180969238, |
|
"learning_rate": 3.966253622941385e-07, |
|
"loss": 0.5887, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.5382955771305284, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6890589594841003, |
|
"eval_runtime": 321.1286, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.546925566343042, |
|
"grad_norm": 13.36107063293457, |
|
"learning_rate": 3.820458705468633e-07, |
|
"loss": 0.5101, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 11.969443321228027, |
|
"learning_rate": 3.677287353327519e-07, |
|
"loss": 0.6162, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.564185544768069, |
|
"grad_norm": 15.6027250289917, |
|
"learning_rate": 3.536747700409932e-07, |
|
"loss": 0.6591, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.5728155339805827, |
|
"grad_norm": 10.335657119750977, |
|
"learning_rate": 3.3988477310948785e-07, |
|
"loss": 0.5749, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.581445523193096, |
|
"grad_norm": 7.062427043914795, |
|
"learning_rate": 3.2635952797949566e-07, |
|
"loss": 0.5362, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.581445523193096, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6879053711891174, |
|
"eval_runtime": 321.1587, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.5900755124056096, |
|
"grad_norm": 9.053596496582031, |
|
"learning_rate": 3.1309980305111674e-07, |
|
"loss": 0.5753, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.598705501618123, |
|
"grad_norm": 9.732317924499512, |
|
"learning_rate": 3.0010635163964186e-07, |
|
"loss": 0.5671, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.6073354908306365, |
|
"grad_norm": 14.350728034973145, |
|
"learning_rate": 2.8737991193275805e-07, |
|
"loss": 0.525, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 3.61596548004315, |
|
"grad_norm": 12.92699146270752, |
|
"learning_rate": 2.7492120694860237e-07, |
|
"loss": 0.5276, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 3.6245954692556634, |
|
"grad_norm": 8.268197059631348, |
|
"learning_rate": 2.627309444946929e-07, |
|
"loss": 0.4971, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.6245954692556634, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6887635588645935, |
|
"eval_runtime": 320.9246, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.6332254584681767, |
|
"grad_norm": 9.3760404586792, |
|
"learning_rate": 2.5080981712771344e-07, |
|
"loss": 0.4793, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 3.6418554476806904, |
|
"grad_norm": 17.867101669311523, |
|
"learning_rate": 2.391585021141668e-07, |
|
"loss": 0.4916, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 3.650485436893204, |
|
"grad_norm": 9.685575485229492, |
|
"learning_rate": 2.2777766139190084e-07, |
|
"loss": 0.54, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.6591154261057173, |
|
"grad_norm": 20.8098201751709, |
|
"learning_rate": 2.1666794153249792e-07, |
|
"loss": 0.6402, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 3.667745415318231, |
|
"grad_norm": 9.999732971191406, |
|
"learning_rate": 2.0582997370454882e-07, |
|
"loss": 0.5009, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.667745415318231, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6899433732032776, |
|
"eval_runtime": 321.085, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.676375404530744, |
|
"grad_norm": 13.440372467041016, |
|
"learning_rate": 1.9526437363778404e-07, |
|
"loss": 0.7073, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.685005393743258, |
|
"grad_norm": 12.25793170928955, |
|
"learning_rate": 1.8497174158810361e-07, |
|
"loss": 0.6589, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.6936353829557715, |
|
"grad_norm": 20.834096908569336, |
|
"learning_rate": 1.749526623034681e-07, |
|
"loss": 0.6127, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.7022653721682848, |
|
"grad_norm": 14.255398750305176, |
|
"learning_rate": 1.6520770499068083e-07, |
|
"loss": 0.4761, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.710895361380798, |
|
"grad_norm": 6.590888977050781, |
|
"learning_rate": 1.557374232830483e-07, |
|
"loss": 0.5813, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.710895361380798, |
|
"eval_accuracy": 0.6621359223300971, |
|
"eval_loss": 0.6886661648750305, |
|
"eval_runtime": 321.0071, |
|
"eval_samples_per_second": 1.604, |
|
"eval_steps_per_second": 1.604, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.7195253505933117, |
|
"grad_norm": 7.404444694519043, |
|
"learning_rate": 1.4654235520892958e-07, |
|
"loss": 0.5689, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.7281553398058254, |
|
"grad_norm": 18.861854553222656, |
|
"learning_rate": 1.3762302316116527e-07, |
|
"loss": 0.4723, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.7367853290183386, |
|
"grad_norm": 20.41657257080078, |
|
"learning_rate": 1.289799338674036e-07, |
|
"loss": 0.6008, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.7454153182308523, |
|
"grad_norm": 11.25420093536377, |
|
"learning_rate": 1.2061357836131104e-07, |
|
"loss": 0.5452, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.7540453074433655, |
|
"grad_norm": 13.756759643554688, |
|
"learning_rate": 1.1252443195467311e-07, |
|
"loss": 0.6147, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.7540453074433655, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6891469955444336, |
|
"eval_runtime": 320.9449, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.762675296655879, |
|
"grad_norm": 13.715859413146973, |
|
"learning_rate": 1.0471295421039251e-07, |
|
"loss": 0.5173, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.771305285868393, |
|
"grad_norm": 7.733090400695801, |
|
"learning_rate": 9.71795889163818e-08, |
|
"loss": 0.6093, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.779935275080906, |
|
"grad_norm": 7.727634429931641, |
|
"learning_rate": 8.992476406034845e-08, |
|
"loss": 0.5655, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.7885652642934198, |
|
"grad_norm": 8.828600883483887, |
|
"learning_rate": 8.294889180548104e-08, |
|
"loss": 0.7, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.797195253505933, |
|
"grad_norm": 8.170161247253418, |
|
"learning_rate": 7.625236846703243e-08, |
|
"loss": 0.6033, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.797195253505933, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6890521049499512, |
|
"eval_runtime": 320.8322, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.8058252427184467, |
|
"grad_norm": 10.907033920288086, |
|
"learning_rate": 6.983557448980549e-08, |
|
"loss": 0.5508, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.81445523193096, |
|
"grad_norm": 16.888439178466797, |
|
"learning_rate": 6.369887442653877e-08, |
|
"loss": 0.5819, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.8230852211434736, |
|
"grad_norm": 20.531522750854492, |
|
"learning_rate": 5.7842616917193064e-08, |
|
"loss": 0.4267, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.831715210355987, |
|
"grad_norm": 8.410703659057617, |
|
"learning_rate": 5.226713466915001e-08, |
|
"loss": 0.5266, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.8403451995685005, |
|
"grad_norm": 6.310892105102539, |
|
"learning_rate": 4.697274443830335e-08, |
|
"loss": 0.565, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.8403451995685005, |
|
"eval_accuracy": 0.6660194174757281, |
|
"eval_loss": 0.6890508532524109, |
|
"eval_runtime": 320.9035, |
|
"eval_samples_per_second": 1.605, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.848975188781014, |
|
"grad_norm": 28.219768524169922, |
|
"learning_rate": 4.195974701106775e-08, |
|
"loss": 0.5493, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.8576051779935274, |
|
"grad_norm": 19.05866241455078, |
|
"learning_rate": 3.722842718728969e-08, |
|
"loss": 0.5646, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.866235167206041, |
|
"grad_norm": 8.093132019042969, |
|
"learning_rate": 3.277905376406654e-08, |
|
"loss": 0.5774, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.8748651564185543, |
|
"grad_norm": 10.243422508239746, |
|
"learning_rate": 2.8611879520476503e-08, |
|
"loss": 0.6114, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.883495145631068, |
|
"grad_norm": 9.737555503845215, |
|
"learning_rate": 2.4727141203216286e-08, |
|
"loss": 0.5044, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.883495145631068, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6893202662467957, |
|
"eval_runtime": 321.2665, |
|
"eval_samples_per_second": 1.603, |
|
"eval_steps_per_second": 1.603, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.8921251348435817, |
|
"grad_norm": 15.192139625549316, |
|
"learning_rate": 2.1125059513152357e-08, |
|
"loss": 0.5512, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.900755124056095, |
|
"grad_norm": 23.43290901184082, |
|
"learning_rate": 1.7805839092781553e-08, |
|
"loss": 0.633, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.909385113268608, |
|
"grad_norm": 13.518702507019043, |
|
"learning_rate": 1.4769668514605374e-08, |
|
"loss": 0.5216, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.918015102481122, |
|
"grad_norm": 11.329241752624512, |
|
"learning_rate": 1.2016720270417448e-08, |
|
"loss": 0.5502, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.9266450916936355, |
|
"grad_norm": 20.290353775024414, |
|
"learning_rate": 9.547150761501922e-09, |
|
"loss": 0.613, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.9266450916936355, |
|
"eval_accuracy": 0.6660194174757281, |
|
"eval_loss": 0.68938148021698, |
|
"eval_runtime": 320.6069, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.9352750809061487, |
|
"grad_norm": 10.623443603515625, |
|
"learning_rate": 7.3611002897489015e-09, |
|
"loss": 0.5943, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.9439050701186624, |
|
"grad_norm": 13.714851379394531, |
|
"learning_rate": 5.458693049684161e-09, |
|
"loss": 0.5628, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.9525350593311757, |
|
"grad_norm": 20.694622039794922, |
|
"learning_rate": 3.8400371214131205e-09, |
|
"loss": 0.5538, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.9611650485436893, |
|
"grad_norm": 14.463215827941895, |
|
"learning_rate": 2.5052244644802048e-09, |
|
"loss": 0.64, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.969795037756203, |
|
"grad_norm": 7.637043476104736, |
|
"learning_rate": 1.4543309126446858e-09, |
|
"loss": 0.4614, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.969795037756203, |
|
"eval_accuracy": 0.6640776699029126, |
|
"eval_loss": 0.6896011829376221, |
|
"eval_runtime": 320.6166, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 1.606, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.9784250269687162, |
|
"grad_norm": 12.583084106445312, |
|
"learning_rate": 6.874161695719084e-10, |
|
"loss": 0.5865, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.98705501618123, |
|
"grad_norm": 16.6655216217041, |
|
"learning_rate": 2.045238054415588e-10, |
|
"loss": 0.5533, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.995685005393743, |
|
"grad_norm": 26.88420867919922, |
|
"learning_rate": 5.681254474088072e-12, |
|
"loss": 0.6292, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.997411003236246, |
|
"step": 4632, |
|
"total_flos": 0.0, |
|
"train_loss": 0.6694142627746947, |
|
"train_runtime": 66014.9203, |
|
"train_samples_per_second": 0.281, |
|
"train_steps_per_second": 0.07 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4632, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|