{ "best_metric": 1.735720157623291, "best_model_checkpoint": "models/dehanalkautsar/mbert-uncased-modified_embedding_table-multi/checkpoint-62000", "epoch": 44.606610911987254, "eval_steps": 2000, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.273994424532059, "grad_norm": 14.472808837890625, "learning_rate": 4.840662842574888e-05, "loss": 6.7847, "step": 2000 }, { "epoch": 1.273994424532059, "eval_loss": 6.310997486114502, "eval_runtime": 1.0444, "eval_samples_per_second": 134.046, "eval_steps_per_second": 2.872, "step": 2000 }, { "epoch": 2.547988849064118, "grad_norm": 13.92039966583252, "learning_rate": 4.6813256851497774e-05, "loss": 5.2141, "step": 4000 }, { "epoch": 2.547988849064118, "eval_loss": 4.0085954666137695, "eval_runtime": 1.0423, "eval_samples_per_second": 134.315, "eval_steps_per_second": 2.878, "step": 4000 }, { "epoch": 3.8219832735961767, "grad_norm": 12.217097282409668, "learning_rate": 4.521988527724665e-05, "loss": 3.6444, "step": 6000 }, { "epoch": 3.8219832735961767, "eval_loss": 3.3225996494293213, "eval_runtime": 1.0422, "eval_samples_per_second": 134.329, "eval_steps_per_second": 2.878, "step": 6000 }, { "epoch": 5.09557945041816, "grad_norm": 13.112276077270508, "learning_rate": 4.362651370299554e-05, "loss": 3.185, "step": 8000 }, { "epoch": 5.09557945041816, "eval_loss": 2.9885096549987793, "eval_runtime": 1.0432, "eval_samples_per_second": 134.208, "eval_steps_per_second": 2.876, "step": 8000 }, { "epoch": 6.369573874950219, "grad_norm": 9.95456314086914, "learning_rate": 4.2033142128744425e-05, "loss": 2.9191, "step": 10000 }, { "epoch": 6.369573874950219, "eval_loss": 2.759176254272461, "eval_runtime": 1.0435, "eval_samples_per_second": 134.158, "eval_steps_per_second": 2.875, "step": 10000 }, { "epoch": 7.643568299482278, "grad_norm": 10.692479133605957, "learning_rate": 4.043977055449331e-05, "loss": 2.7335, "step": 12000 }, { "epoch": 7.643568299482278, "eval_loss": 2.601854085922241, "eval_runtime": 1.7999, "eval_samples_per_second": 77.783, "eval_steps_per_second": 1.667, "step": 12000 }, { "epoch": 8.917562724014337, "grad_norm": 10.000415802001953, "learning_rate": 3.8846398980242196e-05, "loss": 2.6016, "step": 14000 }, { "epoch": 8.917562724014337, "eval_loss": 2.4759533405303955, "eval_runtime": 2.1495, "eval_samples_per_second": 65.131, "eval_steps_per_second": 1.396, "step": 14000 }, { "epoch": 10.19115890083632, "grad_norm": 9.746127128601074, "learning_rate": 3.725302740599108e-05, "loss": 2.4983, "step": 16000 }, { "epoch": 10.19115890083632, "eval_loss": 2.3779261112213135, "eval_runtime": 2.1857, "eval_samples_per_second": 64.052, "eval_steps_per_second": 1.373, "step": 16000 }, { "epoch": 11.465153325368378, "grad_norm": 26.588117599487305, "learning_rate": 3.565965583173996e-05, "loss": 2.4195, "step": 18000 }, { "epoch": 11.465153325368378, "eval_loss": 2.34489369392395, "eval_runtime": 1.0425, "eval_samples_per_second": 134.292, "eval_steps_per_second": 2.878, "step": 18000 }, { "epoch": 12.739147749900438, "grad_norm": 10.147093772888184, "learning_rate": 3.4066284257488853e-05, "loss": 2.355, "step": 20000 }, { "epoch": 12.739147749900438, "eval_loss": 2.2396912574768066, "eval_runtime": 1.0419, "eval_samples_per_second": 134.367, "eval_steps_per_second": 2.879, "step": 20000 }, { "epoch": 14.012743926722422, "grad_norm": 9.359902381896973, "learning_rate": 3.247291268323773e-05, "loss": 2.3011, "step": 22000 }, { "epoch": 14.012743926722422, "eval_loss": 2.2220568656921387, "eval_runtime": 1.0421, "eval_samples_per_second": 134.349, "eval_steps_per_second": 2.879, "step": 22000 }, { "epoch": 15.296296296296296, "grad_norm": 9.719951629638672, "learning_rate": 4.550106849623215e-05, "loss": 2.2641, "step": 24000 }, { "epoch": 15.296296296296296, "eval_loss": 2.1969993114471436, "eval_runtime": 1.0459, "eval_samples_per_second": 133.856, "eval_steps_per_second": 2.868, "step": 24000 }, { "epoch": 16.570290720828353, "grad_norm": 10.155664443969727, "learning_rate": 4.5126157537584826e-05, "loss": 2.2145, "step": 26000 }, { "epoch": 16.570290720828353, "eval_loss": 2.1572253704071045, "eval_runtime": 1.0453, "eval_samples_per_second": 133.93, "eval_steps_per_second": 2.87, "step": 26000 }, { "epoch": 17.844285145360413, "grad_norm": 9.276083946228027, "learning_rate": 4.4751246578937504e-05, "loss": 2.1687, "step": 28000 }, { "epoch": 17.844285145360413, "eval_loss": 2.1415774822235107, "eval_runtime": 1.0448, "eval_samples_per_second": 134.002, "eval_steps_per_second": 2.871, "step": 28000 }, { "epoch": 19.1178813221824, "grad_norm": 9.626545906066895, "learning_rate": 4.437633562029018e-05, "loss": 2.1288, "step": 30000 }, { "epoch": 19.1178813221824, "eval_loss": 2.0869765281677246, "eval_runtime": 1.0442, "eval_samples_per_second": 134.079, "eval_steps_per_second": 2.873, "step": 30000 }, { "epoch": 20.391875746714458, "grad_norm": 9.785582542419434, "learning_rate": 4.400142466164286e-05, "loss": 2.0936, "step": 32000 }, { "epoch": 20.391875746714458, "eval_loss": 2.0372226238250732, "eval_runtime": 1.0457, "eval_samples_per_second": 133.886, "eval_steps_per_second": 2.869, "step": 32000 }, { "epoch": 21.665870171246514, "grad_norm": 10.339402198791504, "learning_rate": 4.362651370299554e-05, "loss": 2.0629, "step": 34000 }, { "epoch": 21.665870171246514, "eval_loss": 2.0290327072143555, "eval_runtime": 1.0449, "eval_samples_per_second": 133.981, "eval_steps_per_second": 2.871, "step": 34000 }, { "epoch": 22.939864595778573, "grad_norm": 33.6297492980957, "learning_rate": 4.325160274434822e-05, "loss": 2.0352, "step": 36000 }, { "epoch": 22.939864595778573, "eval_loss": 1.9997824430465698, "eval_runtime": 1.0457, "eval_samples_per_second": 133.884, "eval_steps_per_second": 2.869, "step": 36000 }, { "epoch": 24.21346077260056, "grad_norm": 9.419952392578125, "learning_rate": 4.2876691785700895e-05, "loss": 2.0085, "step": 38000 }, { "epoch": 24.21346077260056, "eval_loss": 1.9848532676696777, "eval_runtime": 1.0456, "eval_samples_per_second": 133.889, "eval_steps_per_second": 2.869, "step": 38000 }, { "epoch": 25.487455197132615, "grad_norm": 9.297412872314453, "learning_rate": 4.250178082705358e-05, "loss": 1.9835, "step": 40000 }, { "epoch": 25.487455197132615, "eval_loss": 1.9594355821609497, "eval_runtime": 1.0451, "eval_samples_per_second": 133.962, "eval_steps_per_second": 2.871, "step": 40000 }, { "epoch": 26.761449621664674, "grad_norm": 9.560755729675293, "learning_rate": 4.212686986840625e-05, "loss": 1.9622, "step": 42000 }, { "epoch": 26.761449621664674, "eval_loss": 1.9543912410736084, "eval_runtime": 1.0446, "eval_samples_per_second": 134.027, "eval_steps_per_second": 2.872, "step": 42000 }, { "epoch": 28.03504579848666, "grad_norm": 9.696600914001465, "learning_rate": 4.1751958909758937e-05, "loss": 1.9423, "step": 44000 }, { "epoch": 28.03504579848666, "eval_loss": 1.9044808149337769, "eval_runtime": 1.0452, "eval_samples_per_second": 133.942, "eval_steps_per_second": 2.87, "step": 44000 }, { "epoch": 29.309040223018716, "grad_norm": 9.66083812713623, "learning_rate": 4.1377047951111615e-05, "loss": 1.9237, "step": 46000 }, { "epoch": 29.309040223018716, "eval_loss": 1.872032642364502, "eval_runtime": 1.0448, "eval_samples_per_second": 134.0, "eval_steps_per_second": 2.871, "step": 46000 }, { "epoch": 30.583034647550775, "grad_norm": 8.918251991271973, "learning_rate": 4.100213699246429e-05, "loss": 1.905, "step": 48000 }, { "epoch": 30.583034647550775, "eval_loss": 1.857001781463623, "eval_runtime": 1.0437, "eval_samples_per_second": 134.137, "eval_steps_per_second": 2.874, "step": 48000 }, { "epoch": 31.857029072082835, "grad_norm": 9.568062782287598, "learning_rate": 4.062722603381697e-05, "loss": 1.8905, "step": 50000 }, { "epoch": 31.857029072082835, "eval_loss": 1.8617095947265625, "eval_runtime": 1.0446, "eval_samples_per_second": 134.024, "eval_steps_per_second": 2.872, "step": 50000 }, { "epoch": 33.14145758661888, "grad_norm": 9.223485946655273, "learning_rate": 4.025231507516965e-05, "loss": 1.8744, "step": 52000 }, { "epoch": 33.14145758661888, "eval_loss": 1.8508821725845337, "eval_runtime": 1.0449, "eval_samples_per_second": 133.978, "eval_steps_per_second": 2.871, "step": 52000 }, { "epoch": 34.41545201115093, "grad_norm": 9.19514274597168, "learning_rate": 3.987740411652233e-05, "loss": 1.8594, "step": 54000 }, { "epoch": 34.41545201115093, "eval_loss": 1.807867169380188, "eval_runtime": 1.0427, "eval_samples_per_second": 134.272, "eval_steps_per_second": 2.877, "step": 54000 }, { "epoch": 35.689446435682996, "grad_norm": 9.288874626159668, "learning_rate": 3.9502493157875006e-05, "loss": 1.845, "step": 56000 }, { "epoch": 35.689446435682996, "eval_loss": 1.8352290391921997, "eval_runtime": 1.0447, "eval_samples_per_second": 134.011, "eval_steps_per_second": 2.872, "step": 56000 }, { "epoch": 36.96344086021505, "grad_norm": 8.9916353225708, "learning_rate": 3.9127582199227684e-05, "loss": 1.8325, "step": 58000 }, { "epoch": 36.96344086021505, "eval_loss": 1.8519763946533203, "eval_runtime": 1.0422, "eval_samples_per_second": 134.334, "eval_steps_per_second": 2.879, "step": 58000 }, { "epoch": 38.237037037037034, "grad_norm": 9.245351791381836, "learning_rate": 3.875267124058036e-05, "loss": 1.8204, "step": 60000 }, { "epoch": 38.237037037037034, "eval_loss": 1.792788028717041, "eval_runtime": 1.0429, "eval_samples_per_second": 134.237, "eval_steps_per_second": 2.877, "step": 60000 }, { "epoch": 39.5110314615691, "grad_norm": 9.415380477905273, "learning_rate": 3.837776028193305e-05, "loss": 1.808, "step": 62000 }, { "epoch": 39.5110314615691, "eval_loss": 1.735720157623291, "eval_runtime": 1.0435, "eval_samples_per_second": 134.164, "eval_steps_per_second": 2.875, "step": 62000 }, { "epoch": 40.78502588610115, "grad_norm": 9.212890625, "learning_rate": 3.800284932328572e-05, "loss": 1.7973, "step": 64000 }, { "epoch": 40.78502588610115, "eval_loss": 1.7825311422348022, "eval_runtime": 1.0432, "eval_samples_per_second": 134.204, "eval_steps_per_second": 2.876, "step": 64000 }, { "epoch": 42.058622062923135, "grad_norm": 12.76929759979248, "learning_rate": 3.76279383646384e-05, "loss": 1.7868, "step": 66000 }, { "epoch": 42.058622062923135, "eval_loss": 1.8231488466262817, "eval_runtime": 1.0438, "eval_samples_per_second": 134.121, "eval_steps_per_second": 2.874, "step": 66000 }, { "epoch": 43.3326164874552, "grad_norm": 9.313870429992676, "learning_rate": 3.725302740599108e-05, "loss": 1.776, "step": 68000 }, { "epoch": 43.3326164874552, "eval_loss": 1.7821528911590576, "eval_runtime": 1.0442, "eval_samples_per_second": 134.071, "eval_steps_per_second": 2.873, "step": 68000 }, { "epoch": 44.606610911987254, "grad_norm": 9.745814323425293, "learning_rate": 3.687811644734376e-05, "loss": 1.7671, "step": 70000 }, { "epoch": 44.606610911987254, "eval_loss": 1.7638314962387085, "eval_runtime": 1.0433, "eval_samples_per_second": 134.193, "eval_steps_per_second": 2.876, "step": 70000 } ], "logging_steps": 2000, "max_steps": 266730, "num_input_tokens_seen": 0, "num_train_epochs": 170, "save_steps": 2000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.714819396263936e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }