{ "best_metric": null, "best_model_checkpoint": null, "epoch": 112.94117647058823, "eval_steps": 500, "global_step": 1440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.7843137254901961, "grad_norm": 3.280456066131592, "learning_rate": 3.4722222222222224e-06, "loss": 2.7006, "step": 10 }, { "epoch": 0.9411764705882353, "eval_accuracy": 0.11666666666666667, "eval_loss": 2.678151845932007, "eval_runtime": 4.6451, "eval_samples_per_second": 38.751, "eval_steps_per_second": 1.292, "step": 12 }, { "epoch": 1.5686274509803921, "grad_norm": 7.857480049133301, "learning_rate": 6.944444444444445e-06, "loss": 2.6863, "step": 20 }, { "epoch": 1.9607843137254903, "eval_accuracy": 0.16111111111111112, "eval_loss": 2.627171039581299, "eval_runtime": 3.8353, "eval_samples_per_second": 46.933, "eval_steps_per_second": 1.564, "step": 25 }, { "epoch": 2.3529411764705883, "grad_norm": 3.7707808017730713, "learning_rate": 1.0416666666666668e-05, "loss": 2.6437, "step": 30 }, { "epoch": 2.980392156862745, "eval_accuracy": 0.28888888888888886, "eval_loss": 2.5389277935028076, "eval_runtime": 3.8826, "eval_samples_per_second": 46.36, "eval_steps_per_second": 1.545, "step": 38 }, { "epoch": 3.1372549019607843, "grad_norm": 5.107277870178223, "learning_rate": 1.388888888888889e-05, "loss": 2.5839, "step": 40 }, { "epoch": 3.9215686274509802, "grad_norm": 6.703848361968994, "learning_rate": 1.736111111111111e-05, "loss": 2.4851, "step": 50 }, { "epoch": 4.0, "eval_accuracy": 0.4111111111111111, "eval_loss": 2.411587953567505, "eval_runtime": 3.8828, "eval_samples_per_second": 46.359, "eval_steps_per_second": 1.545, "step": 51 }, { "epoch": 4.705882352941177, "grad_norm": 11.087530136108398, "learning_rate": 2.0833333333333336e-05, "loss": 2.3732, "step": 60 }, { "epoch": 4.9411764705882355, "eval_accuracy": 0.4888888888888889, "eval_loss": 2.270714521408081, "eval_runtime": 3.8466, "eval_samples_per_second": 46.794, "eval_steps_per_second": 1.56, "step": 63 }, { "epoch": 5.490196078431373, "grad_norm": 7.360437393188477, "learning_rate": 2.4305555555555558e-05, "loss": 2.2546, "step": 70 }, { "epoch": 5.96078431372549, "eval_accuracy": 0.5722222222222222, "eval_loss": 2.0710320472717285, "eval_runtime": 3.9713, "eval_samples_per_second": 45.326, "eval_steps_per_second": 1.511, "step": 76 }, { "epoch": 6.2745098039215685, "grad_norm": 6.191979885101318, "learning_rate": 2.777777777777778e-05, "loss": 2.1023, "step": 80 }, { "epoch": 6.980392156862745, "eval_accuracy": 0.6166666666666667, "eval_loss": 1.8370894193649292, "eval_runtime": 3.8628, "eval_samples_per_second": 46.599, "eval_steps_per_second": 1.553, "step": 89 }, { "epoch": 7.0588235294117645, "grad_norm": 7.690328598022461, "learning_rate": 3.125e-05, "loss": 1.9156, "step": 90 }, { "epoch": 7.8431372549019605, "grad_norm": 7.919386863708496, "learning_rate": 3.472222222222222e-05, "loss": 1.7115, "step": 100 }, { "epoch": 8.0, "eval_accuracy": 0.6111111111111112, "eval_loss": 1.6161085367202759, "eval_runtime": 3.8886, "eval_samples_per_second": 46.29, "eval_steps_per_second": 1.543, "step": 102 }, { "epoch": 8.627450980392156, "grad_norm": 12.924628257751465, "learning_rate": 3.8194444444444444e-05, "loss": 1.5295, "step": 110 }, { "epoch": 8.941176470588236, "eval_accuracy": 0.6277777777777778, "eval_loss": 1.4381340742111206, "eval_runtime": 3.8538, "eval_samples_per_second": 46.708, "eval_steps_per_second": 1.557, "step": 114 }, { "epoch": 9.411764705882353, "grad_norm": 14.41945743560791, "learning_rate": 4.166666666666667e-05, "loss": 1.3366, "step": 120 }, { "epoch": 9.96078431372549, "eval_accuracy": 0.65, "eval_loss": 1.2539671659469604, "eval_runtime": 3.9661, "eval_samples_per_second": 45.385, "eval_steps_per_second": 1.513, "step": 127 }, { "epoch": 10.196078431372548, "grad_norm": 17.627479553222656, "learning_rate": 4.5138888888888894e-05, "loss": 1.2377, "step": 130 }, { "epoch": 10.980392156862745, "grad_norm": 13.988055229187012, "learning_rate": 4.8611111111111115e-05, "loss": 1.0556, "step": 140 }, { "epoch": 10.980392156862745, "eval_accuracy": 0.6611111111111111, "eval_loss": 1.1632429361343384, "eval_runtime": 3.9305, "eval_samples_per_second": 45.796, "eval_steps_per_second": 1.527, "step": 140 }, { "epoch": 11.764705882352942, "grad_norm": 12.996641159057617, "learning_rate": 4.976851851851852e-05, "loss": 0.9657, "step": 150 }, { "epoch": 12.0, "eval_accuracy": 0.7, "eval_loss": 1.0600230693817139, "eval_runtime": 3.9116, "eval_samples_per_second": 46.016, "eval_steps_per_second": 1.534, "step": 153 }, { "epoch": 12.549019607843137, "grad_norm": 26.70894432067871, "learning_rate": 4.938271604938271e-05, "loss": 0.8703, "step": 160 }, { "epoch": 12.941176470588236, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.9983330368995667, "eval_runtime": 3.823, "eval_samples_per_second": 47.084, "eval_steps_per_second": 1.569, "step": 165 }, { "epoch": 13.333333333333334, "grad_norm": 24.295700073242188, "learning_rate": 4.899691358024692e-05, "loss": 0.8007, "step": 170 }, { "epoch": 13.96078431372549, "eval_accuracy": 0.7277777777777777, "eval_loss": 0.9474301934242249, "eval_runtime": 4.0708, "eval_samples_per_second": 44.218, "eval_steps_per_second": 1.474, "step": 178 }, { "epoch": 14.117647058823529, "grad_norm": 19.43092918395996, "learning_rate": 4.8611111111111115e-05, "loss": 0.7257, "step": 180 }, { "epoch": 14.901960784313726, "grad_norm": 22.165098190307617, "learning_rate": 4.8225308641975306e-05, "loss": 0.6398, "step": 190 }, { "epoch": 14.980392156862745, "eval_accuracy": 0.75, "eval_loss": 0.8633670210838318, "eval_runtime": 4.1208, "eval_samples_per_second": 43.681, "eval_steps_per_second": 1.456, "step": 191 }, { "epoch": 15.686274509803921, "grad_norm": 24.815359115600586, "learning_rate": 4.783950617283951e-05, "loss": 0.6023, "step": 200 }, { "epoch": 16.0, "eval_accuracy": 0.7277777777777777, "eval_loss": 0.8527319431304932, "eval_runtime": 3.869, "eval_samples_per_second": 46.524, "eval_steps_per_second": 1.551, "step": 204 }, { "epoch": 16.470588235294116, "grad_norm": 28.00403594970703, "learning_rate": 4.745370370370371e-05, "loss": 0.583, "step": 210 }, { "epoch": 16.941176470588236, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.7927896976470947, "eval_runtime": 3.8557, "eval_samples_per_second": 46.684, "eval_steps_per_second": 1.556, "step": 216 }, { "epoch": 17.254901960784313, "grad_norm": 30.892383575439453, "learning_rate": 4.70679012345679e-05, "loss": 0.5279, "step": 220 }, { "epoch": 17.96078431372549, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7896744012832642, "eval_runtime": 3.7921, "eval_samples_per_second": 47.468, "eval_steps_per_second": 1.582, "step": 229 }, { "epoch": 18.03921568627451, "grad_norm": 19.16618537902832, "learning_rate": 4.66820987654321e-05, "loss": 0.5084, "step": 230 }, { "epoch": 18.823529411764707, "grad_norm": 13.686722755432129, "learning_rate": 4.62962962962963e-05, "loss": 0.4643, "step": 240 }, { "epoch": 18.980392156862745, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.7885976433753967, "eval_runtime": 3.8507, "eval_samples_per_second": 46.745, "eval_steps_per_second": 1.558, "step": 242 }, { "epoch": 19.607843137254903, "grad_norm": 17.4645938873291, "learning_rate": 4.591049382716049e-05, "loss": 0.4296, "step": 250 }, { "epoch": 20.0, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7328829169273376, "eval_runtime": 3.8093, "eval_samples_per_second": 47.252, "eval_steps_per_second": 1.575, "step": 255 }, { "epoch": 20.392156862745097, "grad_norm": 16.678590774536133, "learning_rate": 4.5524691358024696e-05, "loss": 0.41, "step": 260 }, { "epoch": 20.941176470588236, "eval_accuracy": 0.7611111111111111, "eval_loss": 0.7316663861274719, "eval_runtime": 3.8668, "eval_samples_per_second": 46.55, "eval_steps_per_second": 1.552, "step": 267 }, { "epoch": 21.176470588235293, "grad_norm": 12.328489303588867, "learning_rate": 4.5138888888888894e-05, "loss": 0.3663, "step": 270 }, { "epoch": 21.96078431372549, "grad_norm": 23.868070602416992, "learning_rate": 4.4753086419753084e-05, "loss": 0.3674, "step": 280 }, { "epoch": 21.96078431372549, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.7170845866203308, "eval_runtime": 3.956, "eval_samples_per_second": 45.5, "eval_steps_per_second": 1.517, "step": 280 }, { "epoch": 22.745098039215687, "grad_norm": 21.286258697509766, "learning_rate": 4.436728395061729e-05, "loss": 0.3285, "step": 290 }, { "epoch": 22.980392156862745, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7005434036254883, "eval_runtime": 3.9134, "eval_samples_per_second": 45.996, "eval_steps_per_second": 1.533, "step": 293 }, { "epoch": 23.529411764705884, "grad_norm": 12.573646545410156, "learning_rate": 4.3981481481481486e-05, "loss": 0.2978, "step": 300 }, { "epoch": 24.0, "eval_accuracy": 0.7888888888888889, "eval_loss": 0.6576042771339417, "eval_runtime": 3.8597, "eval_samples_per_second": 46.636, "eval_steps_per_second": 1.555, "step": 306 }, { "epoch": 24.313725490196077, "grad_norm": 18.386383056640625, "learning_rate": 4.359567901234568e-05, "loss": 0.293, "step": 310 }, { "epoch": 24.941176470588236, "eval_accuracy": 0.8, "eval_loss": 0.644997239112854, "eval_runtime": 4.0291, "eval_samples_per_second": 44.675, "eval_steps_per_second": 1.489, "step": 318 }, { "epoch": 25.098039215686274, "grad_norm": 18.57107925415039, "learning_rate": 4.3209876543209875e-05, "loss": 0.2665, "step": 320 }, { "epoch": 25.88235294117647, "grad_norm": 16.507802963256836, "learning_rate": 4.282407407407408e-05, "loss": 0.2724, "step": 330 }, { "epoch": 25.96078431372549, "eval_accuracy": 0.7888888888888889, "eval_loss": 0.6764713525772095, "eval_runtime": 3.8073, "eval_samples_per_second": 47.278, "eval_steps_per_second": 1.576, "step": 331 }, { "epoch": 26.666666666666668, "grad_norm": 18.386646270751953, "learning_rate": 4.243827160493827e-05, "loss": 0.2494, "step": 340 }, { "epoch": 26.980392156862745, "eval_accuracy": 0.8055555555555556, "eval_loss": 0.6826486587524414, "eval_runtime": 3.9305, "eval_samples_per_second": 45.796, "eval_steps_per_second": 1.527, "step": 344 }, { "epoch": 27.45098039215686, "grad_norm": 16.895566940307617, "learning_rate": 4.205246913580247e-05, "loss": 0.2504, "step": 350 }, { "epoch": 28.0, "eval_accuracy": 0.8055555555555556, "eval_loss": 0.6710352301597595, "eval_runtime": 3.8264, "eval_samples_per_second": 47.041, "eval_steps_per_second": 1.568, "step": 357 }, { "epoch": 28.235294117647058, "grad_norm": 14.318737983703613, "learning_rate": 4.166666666666667e-05, "loss": 0.2332, "step": 360 }, { "epoch": 28.941176470588236, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.666705846786499, "eval_runtime": 3.9608, "eval_samples_per_second": 45.445, "eval_steps_per_second": 1.515, "step": 369 }, { "epoch": 29.019607843137255, "grad_norm": 19.979778289794922, "learning_rate": 4.128086419753087e-05, "loss": 0.2071, "step": 370 }, { "epoch": 29.80392156862745, "grad_norm": 14.415696144104004, "learning_rate": 4.089506172839506e-05, "loss": 0.2012, "step": 380 }, { "epoch": 29.96078431372549, "eval_accuracy": 0.7944444444444444, "eval_loss": 0.7399319410324097, "eval_runtime": 3.8411, "eval_samples_per_second": 46.861, "eval_steps_per_second": 1.562, "step": 382 }, { "epoch": 30.58823529411765, "grad_norm": 10.09865665435791, "learning_rate": 4.0509259259259265e-05, "loss": 0.1866, "step": 390 }, { "epoch": 30.980392156862745, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7311467528343201, "eval_runtime": 4.2025, "eval_samples_per_second": 42.831, "eval_steps_per_second": 1.428, "step": 395 }, { "epoch": 31.372549019607842, "grad_norm": 31.518268585205078, "learning_rate": 4.012345679012346e-05, "loss": 0.2031, "step": 400 }, { "epoch": 32.0, "eval_accuracy": 0.7944444444444444, "eval_loss": 0.7076573967933655, "eval_runtime": 3.8553, "eval_samples_per_second": 46.689, "eval_steps_per_second": 1.556, "step": 408 }, { "epoch": 32.15686274509804, "grad_norm": 18.065155029296875, "learning_rate": 3.973765432098765e-05, "loss": 0.1969, "step": 410 }, { "epoch": 32.94117647058823, "grad_norm": 19.082073211669922, "learning_rate": 3.935185185185186e-05, "loss": 0.1969, "step": 420 }, { "epoch": 32.94117647058823, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.7769466638565063, "eval_runtime": 3.9404, "eval_samples_per_second": 45.681, "eval_steps_per_second": 1.523, "step": 420 }, { "epoch": 33.72549019607843, "grad_norm": 23.67195701599121, "learning_rate": 3.8966049382716055e-05, "loss": 0.1968, "step": 430 }, { "epoch": 33.96078431372549, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7666174173355103, "eval_runtime": 3.7732, "eval_samples_per_second": 47.705, "eval_steps_per_second": 1.59, "step": 433 }, { "epoch": 34.509803921568626, "grad_norm": 22.488903045654297, "learning_rate": 3.8580246913580246e-05, "loss": 0.1712, "step": 440 }, { "epoch": 34.98039215686274, "eval_accuracy": 0.8, "eval_loss": 0.6795583367347717, "eval_runtime": 3.7971, "eval_samples_per_second": 47.404, "eval_steps_per_second": 1.58, "step": 446 }, { "epoch": 35.294117647058826, "grad_norm": 20.480487823486328, "learning_rate": 3.8194444444444444e-05, "loss": 0.1813, "step": 450 }, { "epoch": 36.0, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6653857827186584, "eval_runtime": 3.8151, "eval_samples_per_second": 47.181, "eval_steps_per_second": 1.573, "step": 459 }, { "epoch": 36.07843137254902, "grad_norm": 15.032731056213379, "learning_rate": 3.780864197530865e-05, "loss": 0.1625, "step": 460 }, { "epoch": 36.86274509803921, "grad_norm": 12.346388816833496, "learning_rate": 3.742283950617284e-05, "loss": 0.1678, "step": 470 }, { "epoch": 36.94117647058823, "eval_accuracy": 0.7888888888888889, "eval_loss": 0.6851311326026917, "eval_runtime": 3.8058, "eval_samples_per_second": 47.296, "eval_steps_per_second": 1.577, "step": 471 }, { "epoch": 37.64705882352941, "grad_norm": 16.994136810302734, "learning_rate": 3.7037037037037037e-05, "loss": 0.1461, "step": 480 }, { "epoch": 37.96078431372549, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.7054334878921509, "eval_runtime": 3.847, "eval_samples_per_second": 46.79, "eval_steps_per_second": 1.56, "step": 484 }, { "epoch": 38.431372549019606, "grad_norm": 9.07701587677002, "learning_rate": 3.665123456790124e-05, "loss": 0.1244, "step": 490 }, { "epoch": 38.98039215686274, "eval_accuracy": 0.8055555555555556, "eval_loss": 0.7013460993766785, "eval_runtime": 3.8047, "eval_samples_per_second": 47.31, "eval_steps_per_second": 1.577, "step": 497 }, { "epoch": 39.21568627450981, "grad_norm": 9.217803955078125, "learning_rate": 3.626543209876543e-05, "loss": 0.1385, "step": 500 }, { "epoch": 40.0, "grad_norm": 18.938865661621094, "learning_rate": 3.587962962962963e-05, "loss": 0.1329, "step": 510 }, { "epoch": 40.0, "eval_accuracy": 0.8, "eval_loss": 0.6785274147987366, "eval_runtime": 3.86, "eval_samples_per_second": 46.632, "eval_steps_per_second": 1.554, "step": 510 }, { "epoch": 40.78431372549019, "grad_norm": 12.516762733459473, "learning_rate": 3.5493827160493834e-05, "loss": 0.1186, "step": 520 }, { "epoch": 40.94117647058823, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.7499803900718689, "eval_runtime": 3.8115, "eval_samples_per_second": 47.225, "eval_steps_per_second": 1.574, "step": 522 }, { "epoch": 41.568627450980394, "grad_norm": 9.269219398498535, "learning_rate": 3.5108024691358025e-05, "loss": 0.1397, "step": 530 }, { "epoch": 41.96078431372549, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6819199919700623, "eval_runtime": 3.8462, "eval_samples_per_second": 46.799, "eval_steps_per_second": 1.56, "step": 535 }, { "epoch": 42.35294117647059, "grad_norm": 23.982585906982422, "learning_rate": 3.472222222222222e-05, "loss": 0.1324, "step": 540 }, { "epoch": 42.98039215686274, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6256746649742126, "eval_runtime": 3.8195, "eval_samples_per_second": 47.126, "eval_steps_per_second": 1.571, "step": 548 }, { "epoch": 43.13725490196079, "grad_norm": 11.348409652709961, "learning_rate": 3.4336419753086427e-05, "loss": 0.1461, "step": 550 }, { "epoch": 43.92156862745098, "grad_norm": 14.504171371459961, "learning_rate": 3.395061728395062e-05, "loss": 0.111, "step": 560 }, { "epoch": 44.0, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.5938891172409058, "eval_runtime": 3.8255, "eval_samples_per_second": 47.053, "eval_steps_per_second": 1.568, "step": 561 }, { "epoch": 44.705882352941174, "grad_norm": 15.88039493560791, "learning_rate": 3.3564814814814815e-05, "loss": 0.1228, "step": 570 }, { "epoch": 44.94117647058823, "eval_accuracy": 0.8222222222222222, "eval_loss": 0.6379250288009644, "eval_runtime": 3.8691, "eval_samples_per_second": 46.522, "eval_steps_per_second": 1.551, "step": 573 }, { "epoch": 45.490196078431374, "grad_norm": 14.967761993408203, "learning_rate": 3.317901234567901e-05, "loss": 0.1085, "step": 580 }, { "epoch": 45.96078431372549, "eval_accuracy": 0.8222222222222222, "eval_loss": 0.6788524389266968, "eval_runtime": 3.8236, "eval_samples_per_second": 47.077, "eval_steps_per_second": 1.569, "step": 586 }, { "epoch": 46.27450980392157, "grad_norm": 7.978495121002197, "learning_rate": 3.279320987654321e-05, "loss": 0.1234, "step": 590 }, { "epoch": 46.98039215686274, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.624097466468811, "eval_runtime": 3.7905, "eval_samples_per_second": 47.487, "eval_steps_per_second": 1.583, "step": 599 }, { "epoch": 47.05882352941177, "grad_norm": 21.228994369506836, "learning_rate": 3.240740740740741e-05, "loss": 0.1007, "step": 600 }, { "epoch": 47.84313725490196, "grad_norm": 16.632568359375, "learning_rate": 3.2021604938271605e-05, "loss": 0.1129, "step": 610 }, { "epoch": 48.0, "eval_accuracy": 0.7888888888888889, "eval_loss": 0.750299334526062, "eval_runtime": 3.8266, "eval_samples_per_second": 47.039, "eval_steps_per_second": 1.568, "step": 612 }, { "epoch": 48.627450980392155, "grad_norm": 8.629143714904785, "learning_rate": 3.16358024691358e-05, "loss": 0.1197, "step": 620 }, { "epoch": 48.94117647058823, "eval_accuracy": 0.7944444444444444, "eval_loss": 0.6861774325370789, "eval_runtime": 3.8119, "eval_samples_per_second": 47.22, "eval_steps_per_second": 1.574, "step": 624 }, { "epoch": 49.411764705882355, "grad_norm": 7.733061790466309, "learning_rate": 3.125e-05, "loss": 0.0898, "step": 630 }, { "epoch": 49.96078431372549, "eval_accuracy": 0.7888888888888889, "eval_loss": 0.6763875484466553, "eval_runtime": 3.818, "eval_samples_per_second": 47.145, "eval_steps_per_second": 1.571, "step": 637 }, { "epoch": 50.19607843137255, "grad_norm": 4.7213454246521, "learning_rate": 3.08641975308642e-05, "loss": 0.1021, "step": 640 }, { "epoch": 50.98039215686274, "grad_norm": 7.109160423278809, "learning_rate": 3.04783950617284e-05, "loss": 0.1057, "step": 650 }, { "epoch": 50.98039215686274, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6338934898376465, "eval_runtime": 3.8669, "eval_samples_per_second": 46.549, "eval_steps_per_second": 1.552, "step": 650 }, { "epoch": 51.76470588235294, "grad_norm": 16.793262481689453, "learning_rate": 3.0092592592592593e-05, "loss": 0.0893, "step": 660 }, { "epoch": 52.0, "eval_accuracy": 0.85, "eval_loss": 0.5828067064285278, "eval_runtime": 3.8135, "eval_samples_per_second": 47.2, "eval_steps_per_second": 1.573, "step": 663 }, { "epoch": 52.549019607843135, "grad_norm": 15.305388450622559, "learning_rate": 2.970679012345679e-05, "loss": 0.0736, "step": 670 }, { "epoch": 52.94117647058823, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6572611331939697, "eval_runtime": 3.8338, "eval_samples_per_second": 46.951, "eval_steps_per_second": 1.565, "step": 675 }, { "epoch": 53.333333333333336, "grad_norm": 13.537842750549316, "learning_rate": 2.9320987654320992e-05, "loss": 0.0752, "step": 680 }, { "epoch": 53.96078431372549, "eval_accuracy": 0.7944444444444444, "eval_loss": 0.6806420087814331, "eval_runtime": 3.821, "eval_samples_per_second": 47.108, "eval_steps_per_second": 1.57, "step": 688 }, { "epoch": 54.11764705882353, "grad_norm": 21.195058822631836, "learning_rate": 2.8935185185185186e-05, "loss": 0.081, "step": 690 }, { "epoch": 54.90196078431372, "grad_norm": 4.959319591522217, "learning_rate": 2.8549382716049384e-05, "loss": 0.1127, "step": 700 }, { "epoch": 54.98039215686274, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6222459673881531, "eval_runtime": 3.8138, "eval_samples_per_second": 47.197, "eval_steps_per_second": 1.573, "step": 701 }, { "epoch": 55.68627450980392, "grad_norm": 15.941388130187988, "learning_rate": 2.8163580246913578e-05, "loss": 0.1126, "step": 710 }, { "epoch": 56.0, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6305037140846252, "eval_runtime": 3.7889, "eval_samples_per_second": 47.507, "eval_steps_per_second": 1.584, "step": 714 }, { "epoch": 56.470588235294116, "grad_norm": 12.527265548706055, "learning_rate": 2.777777777777778e-05, "loss": 0.0874, "step": 720 }, { "epoch": 56.94117647058823, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6593422293663025, "eval_runtime": 3.7695, "eval_samples_per_second": 47.751, "eval_steps_per_second": 1.592, "step": 726 }, { "epoch": 57.254901960784316, "grad_norm": 7.021444797515869, "learning_rate": 2.7391975308641977e-05, "loss": 0.0806, "step": 730 }, { "epoch": 57.96078431372549, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.7005773782730103, "eval_runtime": 3.7654, "eval_samples_per_second": 47.804, "eval_steps_per_second": 1.593, "step": 739 }, { "epoch": 58.03921568627451, "grad_norm": 13.148822784423828, "learning_rate": 2.700617283950617e-05, "loss": 0.0862, "step": 740 }, { "epoch": 58.8235294117647, "grad_norm": 21.20357322692871, "learning_rate": 2.6620370370370372e-05, "loss": 0.0978, "step": 750 }, { "epoch": 58.98039215686274, "eval_accuracy": 0.8055555555555556, "eval_loss": 0.6680053472518921, "eval_runtime": 3.8444, "eval_samples_per_second": 46.821, "eval_steps_per_second": 1.561, "step": 752 }, { "epoch": 59.6078431372549, "grad_norm": 14.901837348937988, "learning_rate": 2.623456790123457e-05, "loss": 0.0875, "step": 760 }, { "epoch": 60.0, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.67389976978302, "eval_runtime": 3.8169, "eval_samples_per_second": 47.158, "eval_steps_per_second": 1.572, "step": 765 }, { "epoch": 60.3921568627451, "grad_norm": 7.75632905960083, "learning_rate": 2.5848765432098764e-05, "loss": 0.0722, "step": 770 }, { "epoch": 60.94117647058823, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6340806484222412, "eval_runtime": 3.804, "eval_samples_per_second": 47.318, "eval_steps_per_second": 1.577, "step": 777 }, { "epoch": 61.1764705882353, "grad_norm": 15.415738105773926, "learning_rate": 2.5462962962962965e-05, "loss": 0.0901, "step": 780 }, { "epoch": 61.96078431372549, "grad_norm": 4.349020004272461, "learning_rate": 2.5077160493827162e-05, "loss": 0.0942, "step": 790 }, { "epoch": 61.96078431372549, "eval_accuracy": 0.8, "eval_loss": 0.6428362727165222, "eval_runtime": 3.7936, "eval_samples_per_second": 47.448, "eval_steps_per_second": 1.582, "step": 790 }, { "epoch": 62.745098039215684, "grad_norm": 9.049278259277344, "learning_rate": 2.4691358024691357e-05, "loss": 0.0957, "step": 800 }, { "epoch": 62.98039215686274, "eval_accuracy": 0.8, "eval_loss": 0.6757560968399048, "eval_runtime": 3.8348, "eval_samples_per_second": 46.938, "eval_steps_per_second": 1.565, "step": 803 }, { "epoch": 63.529411764705884, "grad_norm": 11.159144401550293, "learning_rate": 2.4305555555555558e-05, "loss": 0.0814, "step": 810 }, { "epoch": 64.0, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6104480028152466, "eval_runtime": 3.8591, "eval_samples_per_second": 46.643, "eval_steps_per_second": 1.555, "step": 816 }, { "epoch": 64.31372549019608, "grad_norm": 11.834258079528809, "learning_rate": 2.3919753086419755e-05, "loss": 0.077, "step": 820 }, { "epoch": 64.94117647058823, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6226403713226318, "eval_runtime": 3.8932, "eval_samples_per_second": 46.235, "eval_steps_per_second": 1.541, "step": 828 }, { "epoch": 65.09803921568627, "grad_norm": 11.226044654846191, "learning_rate": 2.353395061728395e-05, "loss": 0.0862, "step": 830 }, { "epoch": 65.88235294117646, "grad_norm": 14.373990058898926, "learning_rate": 2.314814814814815e-05, "loss": 0.1004, "step": 840 }, { "epoch": 65.96078431372548, "eval_accuracy": 0.8055555555555556, "eval_loss": 0.6898564696311951, "eval_runtime": 3.8229, "eval_samples_per_second": 47.085, "eval_steps_per_second": 1.569, "step": 841 }, { "epoch": 66.66666666666667, "grad_norm": 8.56983757019043, "learning_rate": 2.2762345679012348e-05, "loss": 0.0697, "step": 850 }, { "epoch": 66.98039215686275, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.7104570865631104, "eval_runtime": 3.7843, "eval_samples_per_second": 47.565, "eval_steps_per_second": 1.585, "step": 854 }, { "epoch": 67.45098039215686, "grad_norm": 7.092602729797363, "learning_rate": 2.2376543209876542e-05, "loss": 0.0754, "step": 860 }, { "epoch": 68.0, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.675083339214325, "eval_runtime": 3.8377, "eval_samples_per_second": 46.903, "eval_steps_per_second": 1.563, "step": 867 }, { "epoch": 68.23529411764706, "grad_norm": 21.711984634399414, "learning_rate": 2.1990740740740743e-05, "loss": 0.0842, "step": 870 }, { "epoch": 68.94117647058823, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.6912497878074646, "eval_runtime": 4.1438, "eval_samples_per_second": 43.438, "eval_steps_per_second": 1.448, "step": 879 }, { "epoch": 69.01960784313725, "grad_norm": 17.59021759033203, "learning_rate": 2.1604938271604937e-05, "loss": 0.0815, "step": 880 }, { "epoch": 69.80392156862744, "grad_norm": 6.925079822540283, "learning_rate": 2.1219135802469135e-05, "loss": 0.0684, "step": 890 }, { "epoch": 69.96078431372548, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.7235284447669983, "eval_runtime": 3.8887, "eval_samples_per_second": 46.288, "eval_steps_per_second": 1.543, "step": 892 }, { "epoch": 70.58823529411765, "grad_norm": 10.211894989013672, "learning_rate": 2.0833333333333336e-05, "loss": 0.0684, "step": 900 }, { "epoch": 70.98039215686275, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.5839894413948059, "eval_runtime": 3.8698, "eval_samples_per_second": 46.514, "eval_steps_per_second": 1.55, "step": 905 }, { "epoch": 71.37254901960785, "grad_norm": 7.931591987609863, "learning_rate": 2.044753086419753e-05, "loss": 0.0705, "step": 910 }, { "epoch": 72.0, "eval_accuracy": 0.8222222222222222, "eval_loss": 0.6635811924934387, "eval_runtime": 3.899, "eval_samples_per_second": 46.165, "eval_steps_per_second": 1.539, "step": 918 }, { "epoch": 72.15686274509804, "grad_norm": 8.439017295837402, "learning_rate": 2.006172839506173e-05, "loss": 0.0532, "step": 920 }, { "epoch": 72.94117647058823, "grad_norm": 20.3321475982666, "learning_rate": 1.967592592592593e-05, "loss": 0.0681, "step": 930 }, { "epoch": 72.94117647058823, "eval_accuracy": 0.8, "eval_loss": 0.678679883480072, "eval_runtime": 3.8456, "eval_samples_per_second": 46.807, "eval_steps_per_second": 1.56, "step": 930 }, { "epoch": 73.72549019607843, "grad_norm": 16.518983840942383, "learning_rate": 1.9290123456790123e-05, "loss": 0.0906, "step": 940 }, { "epoch": 73.96078431372548, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6242751479148865, "eval_runtime": 3.8813, "eval_samples_per_second": 46.376, "eval_steps_per_second": 1.546, "step": 943 }, { "epoch": 74.50980392156863, "grad_norm": 10.190762519836426, "learning_rate": 1.8904320987654324e-05, "loss": 0.0453, "step": 950 }, { "epoch": 74.98039215686275, "eval_accuracy": 0.8222222222222222, "eval_loss": 0.6786649823188782, "eval_runtime": 3.8468, "eval_samples_per_second": 46.793, "eval_steps_per_second": 1.56, "step": 956 }, { "epoch": 75.29411764705883, "grad_norm": 12.627336502075195, "learning_rate": 1.8518518518518518e-05, "loss": 0.0874, "step": 960 }, { "epoch": 76.0, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6259381771087646, "eval_runtime": 3.9491, "eval_samples_per_second": 45.58, "eval_steps_per_second": 1.519, "step": 969 }, { "epoch": 76.07843137254902, "grad_norm": 15.190587997436523, "learning_rate": 1.8132716049382716e-05, "loss": 0.0668, "step": 970 }, { "epoch": 76.86274509803921, "grad_norm": 4.534496307373047, "learning_rate": 1.7746913580246917e-05, "loss": 0.051, "step": 980 }, { "epoch": 76.94117647058823, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6590437293052673, "eval_runtime": 3.8024, "eval_samples_per_second": 47.339, "eval_steps_per_second": 1.578, "step": 981 }, { "epoch": 77.6470588235294, "grad_norm": 12.094006538391113, "learning_rate": 1.736111111111111e-05, "loss": 0.0858, "step": 990 }, { "epoch": 77.96078431372548, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6306740045547485, "eval_runtime": 4.0063, "eval_samples_per_second": 44.929, "eval_steps_per_second": 1.498, "step": 994 }, { "epoch": 78.43137254901961, "grad_norm": 12.669979095458984, "learning_rate": 1.697530864197531e-05, "loss": 0.0601, "step": 1000 }, { "epoch": 78.98039215686275, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6041626334190369, "eval_runtime": 3.9261, "eval_samples_per_second": 45.847, "eval_steps_per_second": 1.528, "step": 1007 }, { "epoch": 79.2156862745098, "grad_norm": 13.32419490814209, "learning_rate": 1.6589506172839506e-05, "loss": 0.0596, "step": 1010 }, { "epoch": 80.0, "grad_norm": 17.491554260253906, "learning_rate": 1.6203703703703704e-05, "loss": 0.0601, "step": 1020 }, { "epoch": 80.0, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.5874945521354675, "eval_runtime": 4.0068, "eval_samples_per_second": 44.923, "eval_steps_per_second": 1.497, "step": 1020 }, { "epoch": 80.7843137254902, "grad_norm": 7.148036956787109, "learning_rate": 1.58179012345679e-05, "loss": 0.067, "step": 1030 }, { "epoch": 80.94117647058823, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6078370809555054, "eval_runtime": 3.7634, "eval_samples_per_second": 47.829, "eval_steps_per_second": 1.594, "step": 1032 }, { "epoch": 81.56862745098039, "grad_norm": 6.120352745056152, "learning_rate": 1.54320987654321e-05, "loss": 0.0556, "step": 1040 }, { "epoch": 81.96078431372548, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6006819605827332, "eval_runtime": 3.8921, "eval_samples_per_second": 46.248, "eval_steps_per_second": 1.542, "step": 1045 }, { "epoch": 82.3529411764706, "grad_norm": 9.7392578125, "learning_rate": 1.5046296296296297e-05, "loss": 0.0661, "step": 1050 }, { "epoch": 82.98039215686275, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6062378287315369, "eval_runtime": 3.7924, "eval_samples_per_second": 47.464, "eval_steps_per_second": 1.582, "step": 1058 }, { "epoch": 83.13725490196079, "grad_norm": 7.788672924041748, "learning_rate": 1.4660493827160496e-05, "loss": 0.0594, "step": 1060 }, { "epoch": 83.92156862745098, "grad_norm": 3.5243284702301025, "learning_rate": 1.4274691358024692e-05, "loss": 0.0651, "step": 1070 }, { "epoch": 84.0, "eval_accuracy": 0.8111111111111111, "eval_loss": 0.6387273669242859, "eval_runtime": 3.9274, "eval_samples_per_second": 45.832, "eval_steps_per_second": 1.528, "step": 1071 }, { "epoch": 84.70588235294117, "grad_norm": 10.037181854248047, "learning_rate": 1.388888888888889e-05, "loss": 0.0546, "step": 1080 }, { "epoch": 84.94117647058823, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6861324906349182, "eval_runtime": 3.8238, "eval_samples_per_second": 47.074, "eval_steps_per_second": 1.569, "step": 1083 }, { "epoch": 85.49019607843137, "grad_norm": 10.713313102722168, "learning_rate": 1.3503086419753085e-05, "loss": 0.0827, "step": 1090 }, { "epoch": 85.96078431372548, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6072664260864258, "eval_runtime": 4.2373, "eval_samples_per_second": 42.479, "eval_steps_per_second": 1.416, "step": 1096 }, { "epoch": 86.27450980392157, "grad_norm": 7.2301201820373535, "learning_rate": 1.3117283950617285e-05, "loss": 0.052, "step": 1100 }, { "epoch": 86.98039215686275, "eval_accuracy": 0.85, "eval_loss": 0.593485951423645, "eval_runtime": 3.8302, "eval_samples_per_second": 46.995, "eval_steps_per_second": 1.566, "step": 1109 }, { "epoch": 87.05882352941177, "grad_norm": 4.73368501663208, "learning_rate": 1.2731481481481482e-05, "loss": 0.0442, "step": 1110 }, { "epoch": 87.84313725490196, "grad_norm": 18.29523277282715, "learning_rate": 1.2345679012345678e-05, "loss": 0.0524, "step": 1120 }, { "epoch": 88.0, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.5899335145950317, "eval_runtime": 3.8551, "eval_samples_per_second": 46.692, "eval_steps_per_second": 1.556, "step": 1122 }, { "epoch": 88.62745098039215, "grad_norm": 5.7875494956970215, "learning_rate": 1.1959876543209878e-05, "loss": 0.066, "step": 1130 }, { "epoch": 88.94117647058823, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.5954256057739258, "eval_runtime": 3.8582, "eval_samples_per_second": 46.654, "eval_steps_per_second": 1.555, "step": 1134 }, { "epoch": 89.41176470588235, "grad_norm": 7.960011005401611, "learning_rate": 1.1574074074074075e-05, "loss": 0.0617, "step": 1140 }, { "epoch": 89.96078431372548, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6145300269126892, "eval_runtime": 3.8594, "eval_samples_per_second": 46.639, "eval_steps_per_second": 1.555, "step": 1147 }, { "epoch": 90.19607843137256, "grad_norm": 5.9040350914001465, "learning_rate": 1.1188271604938271e-05, "loss": 0.0373, "step": 1150 }, { "epoch": 90.98039215686275, "grad_norm": 7.361179828643799, "learning_rate": 1.0802469135802469e-05, "loss": 0.0572, "step": 1160 }, { "epoch": 90.98039215686275, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6176372766494751, "eval_runtime": 3.825, "eval_samples_per_second": 47.059, "eval_steps_per_second": 1.569, "step": 1160 }, { "epoch": 91.76470588235294, "grad_norm": 14.031915664672852, "learning_rate": 1.0416666666666668e-05, "loss": 0.0719, "step": 1170 }, { "epoch": 92.0, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6406115889549255, "eval_runtime": 3.7626, "eval_samples_per_second": 47.839, "eval_steps_per_second": 1.595, "step": 1173 }, { "epoch": 92.54901960784314, "grad_norm": 12.152432441711426, "learning_rate": 1.0030864197530866e-05, "loss": 0.0734, "step": 1180 }, { "epoch": 92.94117647058823, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6484689712524414, "eval_runtime": 3.8146, "eval_samples_per_second": 47.187, "eval_steps_per_second": 1.573, "step": 1185 }, { "epoch": 93.33333333333333, "grad_norm": 7.38240385055542, "learning_rate": 9.645061728395062e-06, "loss": 0.0616, "step": 1190 }, { "epoch": 93.96078431372548, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.619816243648529, "eval_runtime": 3.8069, "eval_samples_per_second": 47.282, "eval_steps_per_second": 1.576, "step": 1198 }, { "epoch": 94.11764705882354, "grad_norm": 15.435483932495117, "learning_rate": 9.259259259259259e-06, "loss": 0.047, "step": 1200 }, { "epoch": 94.90196078431373, "grad_norm": 3.8711276054382324, "learning_rate": 8.873456790123458e-06, "loss": 0.0557, "step": 1210 }, { "epoch": 94.98039215686275, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6167161464691162, "eval_runtime": 3.7712, "eval_samples_per_second": 47.73, "eval_steps_per_second": 1.591, "step": 1211 }, { "epoch": 95.68627450980392, "grad_norm": 6.233323574066162, "learning_rate": 8.487654320987654e-06, "loss": 0.0494, "step": 1220 }, { "epoch": 96.0, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6479634642601013, "eval_runtime": 3.8139, "eval_samples_per_second": 47.196, "eval_steps_per_second": 1.573, "step": 1224 }, { "epoch": 96.47058823529412, "grad_norm": 2.76275897026062, "learning_rate": 8.101851851851852e-06, "loss": 0.0587, "step": 1230 }, { "epoch": 96.94117647058823, "eval_accuracy": 0.85, "eval_loss": 0.6075512170791626, "eval_runtime": 3.8781, "eval_samples_per_second": 46.414, "eval_steps_per_second": 1.547, "step": 1236 }, { "epoch": 97.25490196078431, "grad_norm": 6.258754730224609, "learning_rate": 7.71604938271605e-06, "loss": 0.052, "step": 1240 }, { "epoch": 97.96078431372548, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6511959433555603, "eval_runtime": 3.7767, "eval_samples_per_second": 47.661, "eval_steps_per_second": 1.589, "step": 1249 }, { "epoch": 98.03921568627452, "grad_norm": 2.6569933891296387, "learning_rate": 7.330246913580248e-06, "loss": 0.0511, "step": 1250 }, { "epoch": 98.82352941176471, "grad_norm": 6.5559306144714355, "learning_rate": 6.944444444444445e-06, "loss": 0.0383, "step": 1260 }, { "epoch": 98.98039215686275, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6781744360923767, "eval_runtime": 3.7667, "eval_samples_per_second": 47.787, "eval_steps_per_second": 1.593, "step": 1262 }, { "epoch": 99.6078431372549, "grad_norm": 2.813370704650879, "learning_rate": 6.558641975308642e-06, "loss": 0.0499, "step": 1270 }, { "epoch": 100.0, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6542291641235352, "eval_runtime": 3.8026, "eval_samples_per_second": 47.337, "eval_steps_per_second": 1.578, "step": 1275 }, { "epoch": 100.3921568627451, "grad_norm": 16.95922088623047, "learning_rate": 6.172839506172839e-06, "loss": 0.0511, "step": 1280 }, { "epoch": 100.94117647058823, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6794776320457458, "eval_runtime": 3.8468, "eval_samples_per_second": 46.793, "eval_steps_per_second": 1.56, "step": 1287 }, { "epoch": 101.17647058823529, "grad_norm": 15.969926834106445, "learning_rate": 5.787037037037038e-06, "loss": 0.0527, "step": 1290 }, { "epoch": 101.96078431372548, "grad_norm": 15.459400177001953, "learning_rate": 5.401234567901234e-06, "loss": 0.0452, "step": 1300 }, { "epoch": 101.96078431372548, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6739789247512817, "eval_runtime": 3.7395, "eval_samples_per_second": 48.135, "eval_steps_per_second": 1.605, "step": 1300 }, { "epoch": 102.74509803921569, "grad_norm": 5.437152862548828, "learning_rate": 5.015432098765433e-06, "loss": 0.0475, "step": 1310 }, { "epoch": 102.98039215686275, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6615740656852722, "eval_runtime": 3.7883, "eval_samples_per_second": 47.515, "eval_steps_per_second": 1.584, "step": 1313 }, { "epoch": 103.52941176470588, "grad_norm": 12.730355262756348, "learning_rate": 4.6296296296296296e-06, "loss": 0.0455, "step": 1320 }, { "epoch": 104.0, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6490476131439209, "eval_runtime": 3.8157, "eval_samples_per_second": 47.173, "eval_steps_per_second": 1.572, "step": 1326 }, { "epoch": 104.31372549019608, "grad_norm": 10.348357200622559, "learning_rate": 4.243827160493827e-06, "loss": 0.0486, "step": 1330 }, { "epoch": 104.94117647058823, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6331196427345276, "eval_runtime": 3.7919, "eval_samples_per_second": 47.469, "eval_steps_per_second": 1.582, "step": 1338 }, { "epoch": 105.09803921568627, "grad_norm": 16.444259643554688, "learning_rate": 3.858024691358025e-06, "loss": 0.0442, "step": 1340 }, { "epoch": 105.88235294117646, "grad_norm": 7.596277236938477, "learning_rate": 3.4722222222222224e-06, "loss": 0.0585, "step": 1350 }, { "epoch": 105.96078431372548, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6298839449882507, "eval_runtime": 3.7758, "eval_samples_per_second": 47.672, "eval_steps_per_second": 1.589, "step": 1351 }, { "epoch": 106.66666666666667, "grad_norm": 13.853099822998047, "learning_rate": 3.0864197530864196e-06, "loss": 0.0549, "step": 1360 }, { "epoch": 106.98039215686275, "eval_accuracy": 0.8277777777777777, "eval_loss": 0.6397578120231628, "eval_runtime": 3.7676, "eval_samples_per_second": 47.776, "eval_steps_per_second": 1.593, "step": 1364 }, { "epoch": 107.45098039215686, "grad_norm": 10.11983871459961, "learning_rate": 2.700617283950617e-06, "loss": 0.0436, "step": 1370 }, { "epoch": 108.0, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6338447332382202, "eval_runtime": 3.8768, "eval_samples_per_second": 46.43, "eval_steps_per_second": 1.548, "step": 1377 }, { "epoch": 108.23529411764706, "grad_norm": 7.749964237213135, "learning_rate": 2.3148148148148148e-06, "loss": 0.0429, "step": 1380 }, { "epoch": 108.94117647058823, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6458715796470642, "eval_runtime": 3.7687, "eval_samples_per_second": 47.762, "eval_steps_per_second": 1.592, "step": 1389 }, { "epoch": 109.01960784313725, "grad_norm": 5.384810447692871, "learning_rate": 1.9290123456790124e-06, "loss": 0.047, "step": 1390 }, { "epoch": 109.80392156862744, "grad_norm": 14.666231155395508, "learning_rate": 1.5432098765432098e-06, "loss": 0.0449, "step": 1400 }, { "epoch": 109.96078431372548, "eval_accuracy": 0.8444444444444444, "eval_loss": 0.6469634175300598, "eval_runtime": 3.785, "eval_samples_per_second": 47.556, "eval_steps_per_second": 1.585, "step": 1402 }, { "epoch": 110.58823529411765, "grad_norm": 11.94315242767334, "learning_rate": 1.1574074074074074e-06, "loss": 0.0559, "step": 1410 }, { "epoch": 110.98039215686275, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.646262526512146, "eval_runtime": 3.8809, "eval_samples_per_second": 46.381, "eval_steps_per_second": 1.546, "step": 1415 }, { "epoch": 111.37254901960785, "grad_norm": 8.830354690551758, "learning_rate": 7.716049382716049e-07, "loss": 0.0378, "step": 1420 }, { "epoch": 112.0, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6480041742324829, "eval_runtime": 3.8373, "eval_samples_per_second": 46.908, "eval_steps_per_second": 1.564, "step": 1428 }, { "epoch": 112.15686274509804, "grad_norm": 16.867820739746094, "learning_rate": 3.8580246913580245e-07, "loss": 0.0509, "step": 1430 }, { "epoch": 112.94117647058823, "grad_norm": 13.072942733764648, "learning_rate": 0.0, "loss": 0.0476, "step": 1440 }, { "epoch": 112.94117647058823, "eval_accuracy": 0.8388888888888889, "eval_loss": 0.6477780342102051, "eval_runtime": 3.8103, "eval_samples_per_second": 47.24, "eval_steps_per_second": 1.575, "step": 1440 }, { "epoch": 112.94117647058823, "step": 1440, "total_flos": 4.607069541812011e+18, "train_loss": 0.33674598841203585, "train_runtime": 5395.5485, "train_samples_per_second": 36.03, "train_steps_per_second": 0.267 } ], "logging_steps": 10, "max_steps": 1440, "num_input_tokens_seen": 0, "num_train_epochs": 120, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.607069541812011e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }