{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8555111364934325, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014277555682467162, "grad_norm": 7.571424961090088, "learning_rate": 5.000000000000001e-07, "loss": 1.5088, "step": 25 }, { "epoch": 0.028555111364934323, "grad_norm": 5.992729187011719, "learning_rate": 1.0000000000000002e-06, "loss": 1.2038, "step": 50 }, { "epoch": 0.04283266704740148, "grad_norm": 5.949503421783447, "learning_rate": 1.5e-06, "loss": 0.8879, "step": 75 }, { "epoch": 0.05711022272986865, "grad_norm": 4.452832221984863, "learning_rate": 2.0000000000000003e-06, "loss": 0.7647, "step": 100 }, { "epoch": 0.0713877784123358, "grad_norm": 4.690545558929443, "learning_rate": 2.5e-06, "loss": 0.6792, "step": 125 }, { "epoch": 0.08566533409480297, "grad_norm": 4.969720840454102, "learning_rate": 3e-06, "loss": 0.6549, "step": 150 }, { "epoch": 0.09994288977727013, "grad_norm": 5.184281349182129, "learning_rate": 3.5e-06, "loss": 0.6376, "step": 175 }, { "epoch": 0.1142204454597373, "grad_norm": 5.00349235534668, "learning_rate": 4.000000000000001e-06, "loss": 0.5982, "step": 200 }, { "epoch": 0.12849800114220444, "grad_norm": 4.239490032196045, "learning_rate": 4.5e-06, "loss": 0.6084, "step": 225 }, { "epoch": 0.1427755568246716, "grad_norm": 4.2740068435668945, "learning_rate": 5e-06, "loss": 0.58, "step": 250 }, { "epoch": 0.15705311250713877, "grad_norm": 4.718848705291748, "learning_rate": 5.500000000000001e-06, "loss": 0.5759, "step": 275 }, { "epoch": 0.17133066818960593, "grad_norm": 4.2935638427734375, "learning_rate": 6e-06, "loss": 0.5625, "step": 300 }, { "epoch": 0.1856082238720731, "grad_norm": 4.917020797729492, "learning_rate": 6.5000000000000004e-06, "loss": 0.5621, "step": 325 }, { "epoch": 0.19988577955454026, "grad_norm": 3.9521942138671875, "learning_rate": 7e-06, "loss": 0.5644, "step": 350 }, { "epoch": 0.21416333523700742, "grad_norm": 4.506232738494873, "learning_rate": 7.500000000000001e-06, "loss": 0.5508, "step": 375 }, { "epoch": 0.2284408909194746, "grad_norm": 4.1483540534973145, "learning_rate": 8.000000000000001e-06, "loss": 0.5244, "step": 400 }, { "epoch": 0.24271844660194175, "grad_norm": 4.077396392822266, "learning_rate": 8.5e-06, "loss": 0.5051, "step": 425 }, { "epoch": 0.2569960022844089, "grad_norm": 4.375626087188721, "learning_rate": 9e-06, "loss": 0.5222, "step": 450 }, { "epoch": 0.2712735579668761, "grad_norm": 3.5698530673980713, "learning_rate": 9.5e-06, "loss": 0.5038, "step": 475 }, { "epoch": 0.2855511136493432, "grad_norm": 4.99509859085083, "learning_rate": 1e-05, "loss": 0.5196, "step": 500 }, { "epoch": 0.2998286693318104, "grad_norm": 3.666332721710205, "learning_rate": 9.944444444444445e-06, "loss": 0.5066, "step": 525 }, { "epoch": 0.31410622501427754, "grad_norm": 3.9203736782073975, "learning_rate": 9.88888888888889e-06, "loss": 0.4822, "step": 550 }, { "epoch": 0.32838378069674473, "grad_norm": 3.5677530765533447, "learning_rate": 9.833333333333333e-06, "loss": 0.519, "step": 575 }, { "epoch": 0.34266133637921187, "grad_norm": 3.3873414993286133, "learning_rate": 9.777777777777779e-06, "loss": 0.5205, "step": 600 }, { "epoch": 0.35693889206167906, "grad_norm": 3.9527816772460938, "learning_rate": 9.722222222222223e-06, "loss": 0.4769, "step": 625 }, { "epoch": 0.3712164477441462, "grad_norm": 3.3437490463256836, "learning_rate": 9.666666666666667e-06, "loss": 0.4629, "step": 650 }, { "epoch": 0.3854940034266134, "grad_norm": 3.7754790782928467, "learning_rate": 9.611111111111112e-06, "loss": 0.4812, "step": 675 }, { "epoch": 0.3997715591090805, "grad_norm": 3.744267225265503, "learning_rate": 9.555555555555556e-06, "loss": 0.467, "step": 700 }, { "epoch": 0.4140491147915477, "grad_norm": 3.5076072216033936, "learning_rate": 9.5e-06, "loss": 0.4454, "step": 725 }, { "epoch": 0.42832667047401485, "grad_norm": 3.556335687637329, "learning_rate": 9.444444444444445e-06, "loss": 0.4447, "step": 750 }, { "epoch": 0.442604226156482, "grad_norm": 4.256951332092285, "learning_rate": 9.38888888888889e-06, "loss": 0.4809, "step": 775 }, { "epoch": 0.4568817818389492, "grad_norm": 3.533447742462158, "learning_rate": 9.333333333333334e-06, "loss": 0.4425, "step": 800 }, { "epoch": 0.4711593375214163, "grad_norm": 4.324098587036133, "learning_rate": 9.277777777777778e-06, "loss": 0.424, "step": 825 }, { "epoch": 0.4854368932038835, "grad_norm": 2.913189649581909, "learning_rate": 9.222222222222224e-06, "loss": 0.4314, "step": 850 }, { "epoch": 0.49971444888635064, "grad_norm": 3.432490825653076, "learning_rate": 9.166666666666666e-06, "loss": 0.4355, "step": 875 }, { "epoch": 0.5139920045688178, "grad_norm": 3.645869255065918, "learning_rate": 9.111111111111112e-06, "loss": 0.4395, "step": 900 }, { "epoch": 0.528269560251285, "grad_norm": 3.2094240188598633, "learning_rate": 9.055555555555556e-06, "loss": 0.4144, "step": 925 }, { "epoch": 0.5425471159337522, "grad_norm": 3.4623546600341797, "learning_rate": 9e-06, "loss": 0.4277, "step": 950 }, { "epoch": 0.5568246716162193, "grad_norm": 3.640333414077759, "learning_rate": 8.944444444444446e-06, "loss": 0.4246, "step": 975 }, { "epoch": 0.5711022272986864, "grad_norm": 3.0283167362213135, "learning_rate": 8.888888888888888e-06, "loss": 0.4047, "step": 1000 }, { "epoch": 0.5711022272986864, "eval_loss": 0.4848648011684418, "eval_runtime": 1825.4203, "eval_samples_per_second": 2.137, "eval_steps_per_second": 0.134, "eval_wer": 0.35052641746353713, "step": 1000 }, { "epoch": 0.5853797829811537, "grad_norm": 3.7762739658355713, "learning_rate": 8.833333333333334e-06, "loss": 0.4218, "step": 1025 }, { "epoch": 0.5996573386636208, "grad_norm": 3.495347023010254, "learning_rate": 8.777777777777778e-06, "loss": 0.3968, "step": 1050 }, { "epoch": 0.613934894346088, "grad_norm": 3.5088939666748047, "learning_rate": 8.722222222222224e-06, "loss": 0.4108, "step": 1075 }, { "epoch": 0.6282124500285551, "grad_norm": 3.555328845977783, "learning_rate": 8.666666666666668e-06, "loss": 0.4063, "step": 1100 }, { "epoch": 0.6424900057110223, "grad_norm": 2.9576587677001953, "learning_rate": 8.611111111111112e-06, "loss": 0.4116, "step": 1125 }, { "epoch": 0.6567675613934895, "grad_norm": 3.280855178833008, "learning_rate": 8.555555555555556e-06, "loss": 0.4083, "step": 1150 }, { "epoch": 0.6710451170759566, "grad_norm": 3.903722047805786, "learning_rate": 8.5e-06, "loss": 0.411, "step": 1175 }, { "epoch": 0.6853226727584237, "grad_norm": 3.519038438796997, "learning_rate": 8.444444444444446e-06, "loss": 0.3964, "step": 1200 }, { "epoch": 0.6996002284408909, "grad_norm": 3.3553972244262695, "learning_rate": 8.38888888888889e-06, "loss": 0.4049, "step": 1225 }, { "epoch": 0.7138777841233581, "grad_norm": 3.3820197582244873, "learning_rate": 8.333333333333334e-06, "loss": 0.4159, "step": 1250 }, { "epoch": 0.7281553398058253, "grad_norm": 2.782127857208252, "learning_rate": 8.277777777777778e-06, "loss": 0.3859, "step": 1275 }, { "epoch": 0.7424328954882924, "grad_norm": 3.5839345455169678, "learning_rate": 8.222222222222222e-06, "loss": 0.392, "step": 1300 }, { "epoch": 0.7567104511707595, "grad_norm": 3.0308761596679688, "learning_rate": 8.166666666666668e-06, "loss": 0.3899, "step": 1325 }, { "epoch": 0.7709880068532268, "grad_norm": 3.136904001235962, "learning_rate": 8.111111111111112e-06, "loss": 0.3907, "step": 1350 }, { "epoch": 0.7852655625356939, "grad_norm": 3.3192756175994873, "learning_rate": 8.055555555555557e-06, "loss": 0.3941, "step": 1375 }, { "epoch": 0.799543118218161, "grad_norm": 4.766107082366943, "learning_rate": 8.000000000000001e-06, "loss": 0.3887, "step": 1400 }, { "epoch": 0.8138206739006282, "grad_norm": 4.241744041442871, "learning_rate": 7.944444444444445e-06, "loss": 0.4033, "step": 1425 }, { "epoch": 0.8280982295830954, "grad_norm": 3.1559460163116455, "learning_rate": 7.88888888888889e-06, "loss": 0.3567, "step": 1450 }, { "epoch": 0.8423757852655626, "grad_norm": 3.142645835876465, "learning_rate": 7.833333333333333e-06, "loss": 0.3731, "step": 1475 }, { "epoch": 0.8566533409480297, "grad_norm": 3.1183199882507324, "learning_rate": 7.77777777777778e-06, "loss": 0.3668, "step": 1500 }, { "epoch": 0.8709308966304968, "grad_norm": 2.7859325408935547, "learning_rate": 7.722222222222223e-06, "loss": 0.3965, "step": 1525 }, { "epoch": 0.885208452312964, "grad_norm": 3.191088914871216, "learning_rate": 7.666666666666667e-06, "loss": 0.3574, "step": 1550 }, { "epoch": 0.8994860079954312, "grad_norm": 3.0640053749084473, "learning_rate": 7.611111111111111e-06, "loss": 0.3811, "step": 1575 }, { "epoch": 0.9137635636778983, "grad_norm": 3.0769450664520264, "learning_rate": 7.555555555555556e-06, "loss": 0.3788, "step": 1600 }, { "epoch": 0.9280411193603655, "grad_norm": 3.1407933235168457, "learning_rate": 7.500000000000001e-06, "loss": 0.3698, "step": 1625 }, { "epoch": 0.9423186750428326, "grad_norm": 3.410187244415283, "learning_rate": 7.444444444444445e-06, "loss": 0.3907, "step": 1650 }, { "epoch": 0.9565962307252999, "grad_norm": 3.3382880687713623, "learning_rate": 7.38888888888889e-06, "loss": 0.3368, "step": 1675 }, { "epoch": 0.970873786407767, "grad_norm": 3.194368600845337, "learning_rate": 7.333333333333333e-06, "loss": 0.369, "step": 1700 }, { "epoch": 0.9851513420902341, "grad_norm": 3.089852809906006, "learning_rate": 7.277777777777778e-06, "loss": 0.3765, "step": 1725 }, { "epoch": 0.9994288977727013, "grad_norm": 3.0002810955047607, "learning_rate": 7.222222222222223e-06, "loss": 0.3705, "step": 1750 }, { "epoch": 1.0137064534551685, "grad_norm": 2.3977696895599365, "learning_rate": 7.166666666666667e-06, "loss": 0.2584, "step": 1775 }, { "epoch": 1.0279840091376355, "grad_norm": 2.3220465183258057, "learning_rate": 7.111111111111112e-06, "loss": 0.2538, "step": 1800 }, { "epoch": 1.0422615648201028, "grad_norm": 2.819687843322754, "learning_rate": 7.055555555555557e-06, "loss": 0.2571, "step": 1825 }, { "epoch": 1.05653912050257, "grad_norm": 2.514644145965576, "learning_rate": 7e-06, "loss": 0.2806, "step": 1850 }, { "epoch": 1.070816676185037, "grad_norm": 2.1887128353118896, "learning_rate": 6.944444444444445e-06, "loss": 0.2626, "step": 1875 }, { "epoch": 1.0850942318675043, "grad_norm": 2.592247486114502, "learning_rate": 6.88888888888889e-06, "loss": 0.2509, "step": 1900 }, { "epoch": 1.0993717875499716, "grad_norm": 2.371534824371338, "learning_rate": 6.833333333333334e-06, "loss": 0.2605, "step": 1925 }, { "epoch": 1.1136493432324386, "grad_norm": 3.1825778484344482, "learning_rate": 6.777777777777779e-06, "loss": 0.2495, "step": 1950 }, { "epoch": 1.1279268989149058, "grad_norm": 2.901749849319458, "learning_rate": 6.7222222222222235e-06, "loss": 0.261, "step": 1975 }, { "epoch": 1.1422044545973729, "grad_norm": 2.658766984939575, "learning_rate": 6.666666666666667e-06, "loss": 0.2476, "step": 2000 }, { "epoch": 1.1422044545973729, "eval_loss": 0.41870468854904175, "eval_runtime": 1722.2575, "eval_samples_per_second": 2.265, "eval_steps_per_second": 0.142, "eval_wer": 0.3136771950159374, "step": 2000 }, { "epoch": 1.15648201027984, "grad_norm": 2.711312770843506, "learning_rate": 6.6111111111111115e-06, "loss": 0.2414, "step": 2025 }, { "epoch": 1.1707595659623073, "grad_norm": 2.9044759273529053, "learning_rate": 6.555555555555556e-06, "loss": 0.2502, "step": 2050 }, { "epoch": 1.1850371216447744, "grad_norm": 2.549725294113159, "learning_rate": 6.5000000000000004e-06, "loss": 0.2511, "step": 2075 }, { "epoch": 1.1993146773272416, "grad_norm": 2.95792555809021, "learning_rate": 6.444444444444445e-06, "loss": 0.2427, "step": 2100 }, { "epoch": 1.2135922330097086, "grad_norm": 2.686870574951172, "learning_rate": 6.3888888888888885e-06, "loss": 0.2637, "step": 2125 }, { "epoch": 1.227869788692176, "grad_norm": 3.7834455966949463, "learning_rate": 6.333333333333333e-06, "loss": 0.2554, "step": 2150 }, { "epoch": 1.2421473443746431, "grad_norm": 3.0891430377960205, "learning_rate": 6.277777777777778e-06, "loss": 0.2467, "step": 2175 }, { "epoch": 1.2564249000571102, "grad_norm": 2.771472930908203, "learning_rate": 6.222222222222223e-06, "loss": 0.2467, "step": 2200 }, { "epoch": 1.2707024557395774, "grad_norm": 2.6807925701141357, "learning_rate": 6.166666666666667e-06, "loss": 0.2682, "step": 2225 }, { "epoch": 1.2849800114220447, "grad_norm": 2.2320196628570557, "learning_rate": 6.111111111111112e-06, "loss": 0.2408, "step": 2250 }, { "epoch": 1.2992575671045117, "grad_norm": 3.066009759902954, "learning_rate": 6.055555555555555e-06, "loss": 0.2363, "step": 2275 }, { "epoch": 1.313535122786979, "grad_norm": 2.6043167114257812, "learning_rate": 6e-06, "loss": 0.2483, "step": 2300 }, { "epoch": 1.327812678469446, "grad_norm": 2.6250624656677246, "learning_rate": 5.944444444444445e-06, "loss": 0.2563, "step": 2325 }, { "epoch": 1.3420902341519132, "grad_norm": 2.508998394012451, "learning_rate": 5.88888888888889e-06, "loss": 0.2581, "step": 2350 }, { "epoch": 1.3563677898343802, "grad_norm": 2.872715473175049, "learning_rate": 5.833333333333334e-06, "loss": 0.2371, "step": 2375 }, { "epoch": 1.3706453455168475, "grad_norm": 3.1910557746887207, "learning_rate": 5.777777777777778e-06, "loss": 0.2515, "step": 2400 }, { "epoch": 1.3849229011993147, "grad_norm": 2.7466485500335693, "learning_rate": 5.722222222222222e-06, "loss": 0.2578, "step": 2425 }, { "epoch": 1.3992004568817817, "grad_norm": 2.388066530227661, "learning_rate": 5.666666666666667e-06, "loss": 0.2541, "step": 2450 }, { "epoch": 1.413478012564249, "grad_norm": 2.688497304916382, "learning_rate": 5.611111111111112e-06, "loss": 0.2514, "step": 2475 }, { "epoch": 1.4277555682467162, "grad_norm": 2.710899591445923, "learning_rate": 5.555555555555557e-06, "loss": 0.2765, "step": 2500 }, { "epoch": 1.4420331239291833, "grad_norm": 2.296635389328003, "learning_rate": 5.500000000000001e-06, "loss": 0.2487, "step": 2525 }, { "epoch": 1.4563106796116505, "grad_norm": 2.7988133430480957, "learning_rate": 5.444444444444445e-06, "loss": 0.2499, "step": 2550 }, { "epoch": 1.4705882352941178, "grad_norm": 3.1988582611083984, "learning_rate": 5.388888888888889e-06, "loss": 0.2456, "step": 2575 }, { "epoch": 1.4848657909765848, "grad_norm": 2.657517910003662, "learning_rate": 5.333333333333334e-06, "loss": 0.2613, "step": 2600 }, { "epoch": 1.499143346659052, "grad_norm": 2.5517725944519043, "learning_rate": 5.2777777777777785e-06, "loss": 0.2528, "step": 2625 }, { "epoch": 1.5134209023415193, "grad_norm": 2.7166850566864014, "learning_rate": 5.2222222222222226e-06, "loss": 0.2476, "step": 2650 }, { "epoch": 1.5276984580239863, "grad_norm": 2.7338292598724365, "learning_rate": 5.1666666666666675e-06, "loss": 0.2489, "step": 2675 }, { "epoch": 1.5419760137064533, "grad_norm": 2.1498470306396484, "learning_rate": 5.1111111111111115e-06, "loss": 0.2388, "step": 2700 }, { "epoch": 1.5562535693889206, "grad_norm": 2.595247745513916, "learning_rate": 5.0555555555555555e-06, "loss": 0.2566, "step": 2725 }, { "epoch": 1.5705311250713878, "grad_norm": 2.652132987976074, "learning_rate": 5e-06, "loss": 0.239, "step": 2750 }, { "epoch": 1.5848086807538548, "grad_norm": 2.436605930328369, "learning_rate": 4.944444444444445e-06, "loss": 0.2419, "step": 2775 }, { "epoch": 1.599086236436322, "grad_norm": 2.618035316467285, "learning_rate": 4.888888888888889e-06, "loss": 0.2295, "step": 2800 }, { "epoch": 1.6133637921187893, "grad_norm": 2.2901298999786377, "learning_rate": 4.833333333333333e-06, "loss": 0.2446, "step": 2825 }, { "epoch": 1.6276413478012564, "grad_norm": 2.899315595626831, "learning_rate": 4.777777777777778e-06, "loss": 0.2628, "step": 2850 }, { "epoch": 1.6419189034837236, "grad_norm": 2.616224527359009, "learning_rate": 4.722222222222222e-06, "loss": 0.2273, "step": 2875 }, { "epoch": 1.6561964591661908, "grad_norm": 2.43113112449646, "learning_rate": 4.666666666666667e-06, "loss": 0.2362, "step": 2900 }, { "epoch": 1.6704740148486579, "grad_norm": 2.5203065872192383, "learning_rate": 4.611111111111112e-06, "loss": 0.2428, "step": 2925 }, { "epoch": 1.6847515705311251, "grad_norm": 2.3064985275268555, "learning_rate": 4.555555555555556e-06, "loss": 0.2441, "step": 2950 }, { "epoch": 1.6990291262135924, "grad_norm": 2.201695680618286, "learning_rate": 4.5e-06, "loss": 0.2324, "step": 2975 }, { "epoch": 1.7133066818960594, "grad_norm": 2.442471981048584, "learning_rate": 4.444444444444444e-06, "loss": 0.2527, "step": 3000 }, { "epoch": 1.7133066818960594, "eval_loss": 0.3882293701171875, "eval_runtime": 1749.1422, "eval_samples_per_second": 2.23, "eval_steps_per_second": 0.139, "eval_wer": 0.2901091471071187, "step": 3000 }, { "epoch": 1.7275842375785264, "grad_norm": 2.77786922454834, "learning_rate": 4.388888888888889e-06, "loss": 0.2492, "step": 3025 }, { "epoch": 1.7418617932609937, "grad_norm": 2.5009052753448486, "learning_rate": 4.333333333333334e-06, "loss": 0.2341, "step": 3050 }, { "epoch": 1.756139348943461, "grad_norm": 2.780186176300049, "learning_rate": 4.277777777777778e-06, "loss": 0.2407, "step": 3075 }, { "epoch": 1.770416904625928, "grad_norm": 1.9574618339538574, "learning_rate": 4.222222222222223e-06, "loss": 0.2437, "step": 3100 }, { "epoch": 1.7846944603083952, "grad_norm": 2.151125907897949, "learning_rate": 4.166666666666667e-06, "loss": 0.2341, "step": 3125 }, { "epoch": 1.7989720159908624, "grad_norm": 2.170015811920166, "learning_rate": 4.111111111111111e-06, "loss": 0.2373, "step": 3150 }, { "epoch": 1.8132495716733295, "grad_norm": 3.0467231273651123, "learning_rate": 4.055555555555556e-06, "loss": 0.2317, "step": 3175 }, { "epoch": 1.8275271273557967, "grad_norm": 3.0150015354156494, "learning_rate": 4.000000000000001e-06, "loss": 0.228, "step": 3200 }, { "epoch": 1.841804683038264, "grad_norm": 3.275949001312256, "learning_rate": 3.944444444444445e-06, "loss": 0.2438, "step": 3225 }, { "epoch": 1.856082238720731, "grad_norm": 3.0381839275360107, "learning_rate": 3.88888888888889e-06, "loss": 0.2478, "step": 3250 }, { "epoch": 1.8703597944031982, "grad_norm": 2.770716428756714, "learning_rate": 3.833333333333334e-06, "loss": 0.2312, "step": 3275 }, { "epoch": 1.8846373500856655, "grad_norm": 2.6976678371429443, "learning_rate": 3.777777777777778e-06, "loss": 0.2284, "step": 3300 }, { "epoch": 1.8989149057681325, "grad_norm": 2.8799102306365967, "learning_rate": 3.7222222222222225e-06, "loss": 0.2484, "step": 3325 }, { "epoch": 1.9131924614505995, "grad_norm": 2.574629545211792, "learning_rate": 3.6666666666666666e-06, "loss": 0.2295, "step": 3350 }, { "epoch": 1.927470017133067, "grad_norm": 2.4746835231781006, "learning_rate": 3.6111111111111115e-06, "loss": 0.2335, "step": 3375 }, { "epoch": 1.941747572815534, "grad_norm": 3.084383964538574, "learning_rate": 3.555555555555556e-06, "loss": 0.212, "step": 3400 }, { "epoch": 1.956025128498001, "grad_norm": 2.4441068172454834, "learning_rate": 3.5e-06, "loss": 0.221, "step": 3425 }, { "epoch": 1.9703026841804683, "grad_norm": 3.031568765640259, "learning_rate": 3.444444444444445e-06, "loss": 0.2341, "step": 3450 }, { "epoch": 1.9845802398629355, "grad_norm": 2.3584327697753906, "learning_rate": 3.3888888888888893e-06, "loss": 0.2431, "step": 3475 }, { "epoch": 1.9988577955454025, "grad_norm": 2.1590421199798584, "learning_rate": 3.3333333333333333e-06, "loss": 0.2357, "step": 3500 }, { "epoch": 2.0131353512278696, "grad_norm": 2.2845587730407715, "learning_rate": 3.277777777777778e-06, "loss": 0.1576, "step": 3525 }, { "epoch": 2.027412906910337, "grad_norm": 2.033133029937744, "learning_rate": 3.2222222222222227e-06, "loss": 0.1422, "step": 3550 }, { "epoch": 2.041690462592804, "grad_norm": 2.2549259662628174, "learning_rate": 3.1666666666666667e-06, "loss": 0.1473, "step": 3575 }, { "epoch": 2.055968018275271, "grad_norm": 1.5837754011154175, "learning_rate": 3.1111111111111116e-06, "loss": 0.143, "step": 3600 }, { "epoch": 2.0702455739577386, "grad_norm": 1.9988360404968262, "learning_rate": 3.055555555555556e-06, "loss": 0.1416, "step": 3625 }, { "epoch": 2.0845231296402056, "grad_norm": 2.148613929748535, "learning_rate": 3e-06, "loss": 0.1338, "step": 3650 }, { "epoch": 2.0988006853226726, "grad_norm": 1.8176393508911133, "learning_rate": 2.944444444444445e-06, "loss": 0.1514, "step": 3675 }, { "epoch": 2.11307824100514, "grad_norm": 2.60271954536438, "learning_rate": 2.888888888888889e-06, "loss": 0.1533, "step": 3700 }, { "epoch": 2.127355796687607, "grad_norm": 2.120281457901001, "learning_rate": 2.8333333333333335e-06, "loss": 0.1404, "step": 3725 }, { "epoch": 2.141633352370074, "grad_norm": 2.3522286415100098, "learning_rate": 2.7777777777777783e-06, "loss": 0.1511, "step": 3750 }, { "epoch": 2.1559109080525416, "grad_norm": 1.8738924264907837, "learning_rate": 2.7222222222222224e-06, "loss": 0.1417, "step": 3775 }, { "epoch": 2.1701884637350086, "grad_norm": 2.255291223526001, "learning_rate": 2.666666666666667e-06, "loss": 0.1437, "step": 3800 }, { "epoch": 2.1844660194174756, "grad_norm": 1.7046154737472534, "learning_rate": 2.6111111111111113e-06, "loss": 0.1446, "step": 3825 }, { "epoch": 2.198743575099943, "grad_norm": 2.0543861389160156, "learning_rate": 2.5555555555555557e-06, "loss": 0.1504, "step": 3850 }, { "epoch": 2.21302113078241, "grad_norm": 2.139716863632202, "learning_rate": 2.5e-06, "loss": 0.1345, "step": 3875 }, { "epoch": 2.227298686464877, "grad_norm": 1.7999951839447021, "learning_rate": 2.4444444444444447e-06, "loss": 0.1389, "step": 3900 }, { "epoch": 2.241576242147344, "grad_norm": 1.7282090187072754, "learning_rate": 2.388888888888889e-06, "loss": 0.1324, "step": 3925 }, { "epoch": 2.2558537978298117, "grad_norm": 2.6271605491638184, "learning_rate": 2.3333333333333336e-06, "loss": 0.1551, "step": 3950 }, { "epoch": 2.2701313535122787, "grad_norm": 2.170382022857666, "learning_rate": 2.277777777777778e-06, "loss": 0.144, "step": 3975 }, { "epoch": 2.2844089091947457, "grad_norm": 1.796635627746582, "learning_rate": 2.222222222222222e-06, "loss": 0.1568, "step": 4000 }, { "epoch": 2.2844089091947457, "eval_loss": 0.3901652991771698, "eval_runtime": 1765.3609, "eval_samples_per_second": 2.21, "eval_steps_per_second": 0.138, "eval_wer": 0.28160919540229884, "step": 4000 }, { "epoch": 2.298686464877213, "grad_norm": 2.0357980728149414, "learning_rate": 2.166666666666667e-06, "loss": 0.161, "step": 4025 }, { "epoch": 2.31296402055968, "grad_norm": 2.027215003967285, "learning_rate": 2.1111111111111114e-06, "loss": 0.1353, "step": 4050 }, { "epoch": 2.3272415762421472, "grad_norm": 2.8169405460357666, "learning_rate": 2.0555555555555555e-06, "loss": 0.1449, "step": 4075 }, { "epoch": 2.3415191319246147, "grad_norm": 1.9528751373291016, "learning_rate": 2.0000000000000003e-06, "loss": 0.1376, "step": 4100 }, { "epoch": 2.3557966876070817, "grad_norm": 2.5781335830688477, "learning_rate": 1.944444444444445e-06, "loss": 0.1383, "step": 4125 }, { "epoch": 2.3700742432895487, "grad_norm": 2.083077907562256, "learning_rate": 1.888888888888889e-06, "loss": 0.1362, "step": 4150 }, { "epoch": 2.384351798972016, "grad_norm": 2.431272029876709, "learning_rate": 1.8333333333333333e-06, "loss": 0.1329, "step": 4175 }, { "epoch": 2.3986293546544832, "grad_norm": 2.157139539718628, "learning_rate": 1.777777777777778e-06, "loss": 0.1377, "step": 4200 }, { "epoch": 2.4129069103369503, "grad_norm": 2.5328071117401123, "learning_rate": 1.7222222222222224e-06, "loss": 0.1361, "step": 4225 }, { "epoch": 2.4271844660194173, "grad_norm": 2.433239459991455, "learning_rate": 1.6666666666666667e-06, "loss": 0.157, "step": 4250 }, { "epoch": 2.4414620217018848, "grad_norm": 2.5167510509490967, "learning_rate": 1.6111111111111113e-06, "loss": 0.132, "step": 4275 }, { "epoch": 2.455739577384352, "grad_norm": 1.9507442712783813, "learning_rate": 1.5555555555555558e-06, "loss": 0.1625, "step": 4300 }, { "epoch": 2.470017133066819, "grad_norm": 2.2467007637023926, "learning_rate": 1.5e-06, "loss": 0.1333, "step": 4325 }, { "epoch": 2.4842946887492863, "grad_norm": 2.4816768169403076, "learning_rate": 1.4444444444444445e-06, "loss": 0.1499, "step": 4350 }, { "epoch": 2.4985722444317533, "grad_norm": 2.0616416931152344, "learning_rate": 1.3888888888888892e-06, "loss": 0.1508, "step": 4375 }, { "epoch": 2.5128498001142203, "grad_norm": 2.089355230331421, "learning_rate": 1.3333333333333334e-06, "loss": 0.1344, "step": 4400 }, { "epoch": 2.5271273557966873, "grad_norm": 2.2235498428344727, "learning_rate": 1.28e-06, "loss": 0.1717, "step": 4425 }, { "epoch": 2.541404911479155, "grad_norm": 1.9268138408660889, "learning_rate": 1.2244444444444445e-06, "loss": 0.143, "step": 4450 }, { "epoch": 2.555682467161622, "grad_norm": 1.8911551237106323, "learning_rate": 1.168888888888889e-06, "loss": 0.1439, "step": 4475 }, { "epoch": 2.5699600228440893, "grad_norm": 2.5078868865966797, "learning_rate": 1.1133333333333334e-06, "loss": 0.1341, "step": 4500 }, { "epoch": 2.5842375785265563, "grad_norm": 2.1232492923736572, "learning_rate": 1.0577777777777779e-06, "loss": 0.1415, "step": 4525 }, { "epoch": 2.5985151342090234, "grad_norm": 1.9214311838150024, "learning_rate": 1.0022222222222223e-06, "loss": 0.1301, "step": 4550 }, { "epoch": 2.6127926898914904, "grad_norm": 2.4226858615875244, "learning_rate": 9.466666666666667e-07, "loss": 0.1438, "step": 4575 }, { "epoch": 2.627070245573958, "grad_norm": 2.324777126312256, "learning_rate": 8.911111111111112e-07, "loss": 0.1306, "step": 4600 }, { "epoch": 2.641347801256425, "grad_norm": 2.427114486694336, "learning_rate": 8.355555555555556e-07, "loss": 0.1359, "step": 4625 }, { "epoch": 2.655625356938892, "grad_norm": 1.989882469177246, "learning_rate": 7.8e-07, "loss": 0.1386, "step": 4650 }, { "epoch": 2.6699029126213594, "grad_norm": 2.6079118251800537, "learning_rate": 7.244444444444446e-07, "loss": 0.135, "step": 4675 }, { "epoch": 2.6841804683038264, "grad_norm": 2.3429243564605713, "learning_rate": 6.68888888888889e-07, "loss": 0.1356, "step": 4700 }, { "epoch": 2.6984580239862934, "grad_norm": 2.3358540534973145, "learning_rate": 6.133333333333333e-07, "loss": 0.1304, "step": 4725 }, { "epoch": 2.7127355796687604, "grad_norm": 1.917809247970581, "learning_rate": 5.577777777777779e-07, "loss": 0.1395, "step": 4750 }, { "epoch": 2.727013135351228, "grad_norm": 2.0677952766418457, "learning_rate": 5.022222222222222e-07, "loss": 0.1309, "step": 4775 }, { "epoch": 2.741290691033695, "grad_norm": 2.135127305984497, "learning_rate": 4.466666666666667e-07, "loss": 0.1424, "step": 4800 }, { "epoch": 2.7555682467161624, "grad_norm": 2.3306682109832764, "learning_rate": 3.9111111111111115e-07, "loss": 0.1318, "step": 4825 }, { "epoch": 2.7698458023986294, "grad_norm": 2.0700454711914062, "learning_rate": 3.3555555555555556e-07, "loss": 0.1566, "step": 4850 }, { "epoch": 2.7841233580810965, "grad_norm": 1.8561683893203735, "learning_rate": 2.8e-07, "loss": 0.1453, "step": 4875 }, { "epoch": 2.7984009137635635, "grad_norm": 2.2682347297668457, "learning_rate": 2.2444444444444445e-07, "loss": 0.1415, "step": 4900 }, { "epoch": 2.812678469446031, "grad_norm": 2.2898778915405273, "learning_rate": 1.6888888888888888e-07, "loss": 0.1427, "step": 4925 }, { "epoch": 2.826956025128498, "grad_norm": 2.328401803970337, "learning_rate": 1.1333333333333336e-07, "loss": 0.1357, "step": 4950 }, { "epoch": 2.841233580810965, "grad_norm": 2.2169013023376465, "learning_rate": 5.777777777777778e-08, "loss": 0.1343, "step": 4975 }, { "epoch": 2.8555111364934325, "grad_norm": 2.42340350151062, "learning_rate": 2.2222222222222225e-09, "loss": 0.1313, "step": 5000 }, { "epoch": 2.8555111364934325, "eval_loss": 0.38383349776268005, "eval_runtime": 1820.062, "eval_samples_per_second": 2.143, "eval_steps_per_second": 0.134, "eval_wer": 0.27318168646769053, "step": 5000 }, { "epoch": 2.8555111364934325, "step": 5000, "total_flos": 5.435589590699213e+20, "train_loss": 0.3002769865989685, "train_runtime": 59305.2217, "train_samples_per_second": 2.698, "train_steps_per_second": 0.084 } ], "logging_steps": 25, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.435589590699213e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }