mms-1b-bemgen-combined-model / trainer_state.json
csikasote's picture
End of training
684074e verified
{
"best_metric": 0.24565543234348297,
"best_model_checkpoint": "/scratch/skscla001/speech/results/mms-1b-bemgen-combined-model/checkpoint-3900",
"epoch": 2.1660649819494586,
"eval_steps": 100,
"global_step": 4200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05157297576070139,
"grad_norm": 2.867751359939575,
"learning_rate": 0.000285,
"loss": 6.8762,
"step": 100
},
{
"epoch": 0.05157297576070139,
"eval_loss": 0.9800576567649841,
"eval_runtime": 61.3726,
"eval_samples_per_second": 15.903,
"eval_steps_per_second": 3.976,
"eval_wer": 0.9386155855463648,
"step": 100
},
{
"epoch": 0.10314595152140278,
"grad_norm": 4.205641269683838,
"learning_rate": 0.0002995092130187704,
"loss": 0.5788,
"step": 200
},
{
"epoch": 0.10314595152140278,
"eval_loss": 0.34664157032966614,
"eval_runtime": 60.8325,
"eval_samples_per_second": 16.044,
"eval_steps_per_second": 4.011,
"eval_wer": 0.5014148889856335,
"step": 200
},
{
"epoch": 0.15471892728210418,
"grad_norm": 16.612009048461914,
"learning_rate": 0.00029899259514379193,
"loss": 0.4891,
"step": 300
},
{
"epoch": 0.15471892728210418,
"eval_loss": 0.3219561278820038,
"eval_runtime": 60.7824,
"eval_samples_per_second": 16.057,
"eval_steps_per_second": 4.014,
"eval_wer": 0.48204179364388333,
"step": 300
},
{
"epoch": 0.20629190304280556,
"grad_norm": 15.71129035949707,
"learning_rate": 0.0002984759772688135,
"loss": 0.4386,
"step": 400
},
{
"epoch": 0.20629190304280556,
"eval_loss": 0.30709779262542725,
"eval_runtime": 60.8186,
"eval_samples_per_second": 16.048,
"eval_steps_per_second": 4.012,
"eval_wer": 0.4801915542011319,
"step": 400
},
{
"epoch": 0.25786487880350695,
"grad_norm": 2.6226158142089844,
"learning_rate": 0.000297959359393835,
"loss": 0.4272,
"step": 500
},
{
"epoch": 0.25786487880350695,
"eval_loss": 0.30557531118392944,
"eval_runtime": 61.7053,
"eval_samples_per_second": 15.817,
"eval_steps_per_second": 3.954,
"eval_wer": 0.49880278624292557,
"step": 500
},
{
"epoch": 0.30943785456420836,
"grad_norm": 2.265719413757324,
"learning_rate": 0.00029744274151885655,
"loss": 0.3982,
"step": 600
},
{
"epoch": 0.30943785456420836,
"eval_loss": 0.2980726957321167,
"eval_runtime": 61.2582,
"eval_samples_per_second": 15.933,
"eval_steps_per_second": 3.983,
"eval_wer": 0.4625598606878537,
"step": 600
},
{
"epoch": 0.36101083032490977,
"grad_norm": 1.4846241474151611,
"learning_rate": 0.00029692612364387805,
"loss": 0.425,
"step": 700
},
{
"epoch": 0.36101083032490977,
"eval_loss": 0.2976619601249695,
"eval_runtime": 61.0566,
"eval_samples_per_second": 15.985,
"eval_steps_per_second": 3.996,
"eval_wer": 0.4631040487592512,
"step": 700
},
{
"epoch": 0.4125838060856111,
"grad_norm": 1.5233323574066162,
"learning_rate": 0.0002964095057688996,
"loss": 0.4036,
"step": 800
},
{
"epoch": 0.4125838060856111,
"eval_loss": 0.2897385358810425,
"eval_runtime": 60.9529,
"eval_samples_per_second": 16.012,
"eval_steps_per_second": 4.003,
"eval_wer": 0.44383979103178056,
"step": 800
},
{
"epoch": 0.46415678184631254,
"grad_norm": 1.4918992519378662,
"learning_rate": 0.0002958928878939211,
"loss": 0.3903,
"step": 900
},
{
"epoch": 0.46415678184631254,
"eval_loss": 0.28775253891944885,
"eval_runtime": 61.5398,
"eval_samples_per_second": 15.86,
"eval_steps_per_second": 3.965,
"eval_wer": 0.4626686983021332,
"step": 900
},
{
"epoch": 0.5157297576070139,
"grad_norm": 35.91661071777344,
"learning_rate": 0.0002953762700189426,
"loss": 0.3758,
"step": 1000
},
{
"epoch": 0.5157297576070139,
"eval_loss": 0.29262155294418335,
"eval_runtime": 61.6256,
"eval_samples_per_second": 15.838,
"eval_steps_per_second": 3.959,
"eval_wer": 0.4523291249455812,
"step": 1000
},
{
"epoch": 0.5673027333677153,
"grad_norm": 2.335728645324707,
"learning_rate": 0.0002948596521439642,
"loss": 0.3861,
"step": 1100
},
{
"epoch": 0.5673027333677153,
"eval_loss": 0.28073564171791077,
"eval_runtime": 60.9884,
"eval_samples_per_second": 16.003,
"eval_steps_per_second": 4.001,
"eval_wer": 0.44101001306051374,
"step": 1100
},
{
"epoch": 0.6188757091284167,
"grad_norm": 1.2785513401031494,
"learning_rate": 0.0002943430342689857,
"loss": 0.3763,
"step": 1200
},
{
"epoch": 0.6188757091284167,
"eval_loss": 0.2789745032787323,
"eval_runtime": 61.0183,
"eval_samples_per_second": 15.995,
"eval_steps_per_second": 3.999,
"eval_wer": 0.4330648672181106,
"step": 1200
},
{
"epoch": 0.6704486848891181,
"grad_norm": 4.705647945404053,
"learning_rate": 0.0002938264163940072,
"loss": 0.3984,
"step": 1300
},
{
"epoch": 0.6704486848891181,
"eval_loss": 0.2803143262863159,
"eval_runtime": 61.7547,
"eval_samples_per_second": 15.804,
"eval_steps_per_second": 3.951,
"eval_wer": 0.4312146277753592,
"step": 1300
},
{
"epoch": 0.7220216606498195,
"grad_norm": 1.780588984489441,
"learning_rate": 0.00029330979851902874,
"loss": 0.373,
"step": 1400
},
{
"epoch": 0.7220216606498195,
"eval_loss": 0.2802477478981018,
"eval_runtime": 61.6178,
"eval_samples_per_second": 15.84,
"eval_steps_per_second": 3.96,
"eval_wer": 0.42457553330430997,
"step": 1400
},
{
"epoch": 0.7735946364105208,
"grad_norm": 2.3556313514709473,
"learning_rate": 0.0002927931806440503,
"loss": 0.3848,
"step": 1500
},
{
"epoch": 0.7735946364105208,
"eval_loss": 0.2759012281894684,
"eval_runtime": 61.2513,
"eval_samples_per_second": 15.934,
"eval_steps_per_second": 3.984,
"eval_wer": 0.47518502394427514,
"step": 1500
},
{
"epoch": 0.8251676121712223,
"grad_norm": 4.117414951324463,
"learning_rate": 0.0002922765627690718,
"loss": 0.4235,
"step": 1600
},
{
"epoch": 0.8251676121712223,
"eval_loss": 0.2738034725189209,
"eval_runtime": 60.9855,
"eval_samples_per_second": 16.004,
"eval_steps_per_second": 4.001,
"eval_wer": 0.42675228558989986,
"step": 1600
},
{
"epoch": 0.8767405879319237,
"grad_norm": 7.5644683837890625,
"learning_rate": 0.0002917651110728431,
"loss": 0.3704,
"step": 1700
},
{
"epoch": 0.8767405879319237,
"eval_loss": 0.26875266432762146,
"eval_runtime": 61.6159,
"eval_samples_per_second": 15.84,
"eval_steps_per_second": 3.96,
"eval_wer": 0.4218545929473226,
"step": 1700
},
{
"epoch": 0.9283135636926251,
"grad_norm": 2.0241034030914307,
"learning_rate": 0.00029124849319786463,
"loss": 0.3911,
"step": 1800
},
{
"epoch": 0.9283135636926251,
"eval_loss": 0.2653037905693054,
"eval_runtime": 61.6914,
"eval_samples_per_second": 15.821,
"eval_steps_per_second": 3.955,
"eval_wer": 0.42011319111885065,
"step": 1800
},
{
"epoch": 0.9798865394533265,
"grad_norm": 1.9250996112823486,
"learning_rate": 0.00029073187532288613,
"loss": 0.3954,
"step": 1900
},
{
"epoch": 0.9798865394533265,
"eval_loss": 0.26971080899238586,
"eval_runtime": 61.0364,
"eval_samples_per_second": 15.99,
"eval_steps_per_second": 3.998,
"eval_wer": 0.4481932956029604,
"step": 1900
},
{
"epoch": 1.0314595152140278,
"grad_norm": 2.152578353881836,
"learning_rate": 0.0002902152574479077,
"loss": 0.352,
"step": 2000
},
{
"epoch": 1.0314595152140278,
"eval_loss": 0.26541659235954285,
"eval_runtime": 61.1446,
"eval_samples_per_second": 15.962,
"eval_steps_per_second": 3.991,
"eval_wer": 0.4154331737048324,
"step": 2000
},
{
"epoch": 1.0830324909747293,
"grad_norm": 1.2193535566329956,
"learning_rate": 0.0002896986395729292,
"loss": 0.3808,
"step": 2100
},
{
"epoch": 1.0830324909747293,
"eval_loss": 0.2631310522556305,
"eval_runtime": 61.7949,
"eval_samples_per_second": 15.794,
"eval_steps_per_second": 3.949,
"eval_wer": 0.40509360034828035,
"step": 2100
},
{
"epoch": 1.1346054667354306,
"grad_norm": 0.9607815742492676,
"learning_rate": 0.0002891820216979507,
"loss": 0.3681,
"step": 2200
},
{
"epoch": 1.1346054667354306,
"eval_loss": 0.26097217202186584,
"eval_runtime": 61.6359,
"eval_samples_per_second": 15.835,
"eval_steps_per_second": 3.959,
"eval_wer": 0.4218545929473226,
"step": 2200
},
{
"epoch": 1.1861784424961321,
"grad_norm": 1.6105040311813354,
"learning_rate": 0.00028866540382297226,
"loss": 0.3355,
"step": 2300
},
{
"epoch": 1.1861784424961321,
"eval_loss": 0.26081275939941406,
"eval_runtime": 61.1026,
"eval_samples_per_second": 15.973,
"eval_steps_per_second": 3.993,
"eval_wer": 0.40977361776229865,
"step": 2300
},
{
"epoch": 1.2377514182568334,
"grad_norm": 2.3081679344177246,
"learning_rate": 0.0002881487859479938,
"loss": 0.342,
"step": 2400
},
{
"epoch": 1.2377514182568334,
"eval_loss": 0.2601791322231293,
"eval_runtime": 61.0993,
"eval_samples_per_second": 15.974,
"eval_steps_per_second": 3.993,
"eval_wer": 0.40824989116238575,
"step": 2400
},
{
"epoch": 1.2893243940175347,
"grad_norm": 0.6918842792510986,
"learning_rate": 0.0002876321680730153,
"loss": 0.347,
"step": 2500
},
{
"epoch": 1.2893243940175347,
"eval_loss": 0.26280567049980164,
"eval_runtime": 61.7023,
"eval_samples_per_second": 15.818,
"eval_steps_per_second": 3.954,
"eval_wer": 0.40552895080539836,
"step": 2500
},
{
"epoch": 1.3408973697782363,
"grad_norm": 0.920050323009491,
"learning_rate": 0.0002871155501980368,
"loss": 0.3409,
"step": 2600
},
{
"epoch": 1.3408973697782363,
"eval_loss": 0.25879552960395813,
"eval_runtime": 61.69,
"eval_samples_per_second": 15.821,
"eval_steps_per_second": 3.955,
"eval_wer": 0.412929908576404,
"step": 2600
},
{
"epoch": 1.3924703455389376,
"grad_norm": 0.7028564810752869,
"learning_rate": 0.0002865989323230584,
"loss": 0.3423,
"step": 2700
},
{
"epoch": 1.3924703455389376,
"eval_loss": 0.2616526484489441,
"eval_runtime": 61.13,
"eval_samples_per_second": 15.966,
"eval_steps_per_second": 3.991,
"eval_wer": 0.41924249020461474,
"step": 2700
},
{
"epoch": 1.444043321299639,
"grad_norm": 3.5141775608062744,
"learning_rate": 0.0002860823144480799,
"loss": 0.3341,
"step": 2800
},
{
"epoch": 1.444043321299639,
"eval_loss": 0.25779473781585693,
"eval_runtime": 61.3161,
"eval_samples_per_second": 15.918,
"eval_steps_per_second": 3.979,
"eval_wer": 0.40552895080539836,
"step": 2800
},
{
"epoch": 1.4956162970603404,
"grad_norm": 1.0055650472640991,
"learning_rate": 0.0002855656965731014,
"loss": 0.3425,
"step": 2900
},
{
"epoch": 1.4956162970603404,
"eval_loss": 0.2579568922519684,
"eval_runtime": 61.9639,
"eval_samples_per_second": 15.751,
"eval_steps_per_second": 3.938,
"eval_wer": 0.39878101872006966,
"step": 2900
},
{
"epoch": 1.5471892728210417,
"grad_norm": 1.4293900728225708,
"learning_rate": 0.00028504907869812294,
"loss": 0.337,
"step": 3000
},
{
"epoch": 1.5471892728210417,
"eval_loss": 0.25681352615356445,
"eval_runtime": 61.6019,
"eval_samples_per_second": 15.844,
"eval_steps_per_second": 3.961,
"eval_wer": 0.40705267740531126,
"step": 3000
},
{
"epoch": 1.5987622485817432,
"grad_norm": 0.6135945916175842,
"learning_rate": 0.00028453246082314445,
"loss": 0.3412,
"step": 3100
},
{
"epoch": 1.5987622485817432,
"eval_loss": 0.25524020195007324,
"eval_runtime": 61.308,
"eval_samples_per_second": 15.92,
"eval_steps_per_second": 3.98,
"eval_wer": 0.39932520679146716,
"step": 3100
},
{
"epoch": 1.6503352243424445,
"grad_norm": 4.757889270782471,
"learning_rate": 0.00028401584294816595,
"loss": 0.3837,
"step": 3200
},
{
"epoch": 1.6503352243424445,
"eval_loss": 0.26221156120300293,
"eval_runtime": 61.5959,
"eval_samples_per_second": 15.845,
"eval_steps_per_second": 3.961,
"eval_wer": 0.40835872877666524,
"step": 3200
},
{
"epoch": 1.701908200103146,
"grad_norm": 1.1227970123291016,
"learning_rate": 0.0002834992250731875,
"loss": 0.3372,
"step": 3300
},
{
"epoch": 1.701908200103146,
"eval_loss": 0.2548165023326874,
"eval_runtime": 61.9858,
"eval_samples_per_second": 15.746,
"eval_steps_per_second": 3.936,
"eval_wer": 0.3991075315629081,
"step": 3300
},
{
"epoch": 1.7534811758638473,
"grad_norm": 0.8613722324371338,
"learning_rate": 0.00028298260719820907,
"loss": 0.3394,
"step": 3400
},
{
"epoch": 1.7534811758638473,
"eval_loss": 0.2535094916820526,
"eval_runtime": 61.5961,
"eval_samples_per_second": 15.845,
"eval_steps_per_second": 3.961,
"eval_wer": 0.4060731388767958,
"step": 3400
},
{
"epoch": 1.8050541516245486,
"grad_norm": 0.5101200938224792,
"learning_rate": 0.00028246598932323057,
"loss": 0.3542,
"step": 3500
},
{
"epoch": 1.8050541516245486,
"eval_loss": 0.25123441219329834,
"eval_runtime": 61.2846,
"eval_samples_per_second": 15.926,
"eval_steps_per_second": 3.981,
"eval_wer": 0.39268611232041795,
"step": 3500
},
{
"epoch": 1.8566271273852502,
"grad_norm": 1.271552324295044,
"learning_rate": 0.0002819493714482521,
"loss": 0.3368,
"step": 3600
},
{
"epoch": 1.8566271273852502,
"eval_loss": 0.258027583360672,
"eval_runtime": 61.4346,
"eval_samples_per_second": 15.887,
"eval_steps_per_second": 3.972,
"eval_wer": 0.4004135829342621,
"step": 3600
},
{
"epoch": 1.9082001031459517,
"grad_norm": 1.8105818033218384,
"learning_rate": 0.00028143275357327363,
"loss": 0.3807,
"step": 3700
},
{
"epoch": 1.9082001031459517,
"eval_loss": 0.24900555610656738,
"eval_runtime": 61.7426,
"eval_samples_per_second": 15.808,
"eval_steps_per_second": 3.952,
"eval_wer": 0.39747496734871574,
"step": 3700
},
{
"epoch": 1.959773078906653,
"grad_norm": 1.8133718967437744,
"learning_rate": 0.00028091613569829514,
"loss": 0.3454,
"step": 3800
},
{
"epoch": 1.959773078906653,
"eval_loss": 0.2513742446899414,
"eval_runtime": 61.6449,
"eval_samples_per_second": 15.833,
"eval_steps_per_second": 3.958,
"eval_wer": 0.40019590770570307,
"step": 3800
},
{
"epoch": 2.0113460546673543,
"grad_norm": 0.7546507120132446,
"learning_rate": 0.00028039951782331664,
"loss": 0.3456,
"step": 3900
},
{
"epoch": 2.0113460546673543,
"eval_loss": 0.24565543234348297,
"eval_runtime": 61.2207,
"eval_samples_per_second": 15.942,
"eval_steps_per_second": 3.986,
"eval_wer": 0.3931214627775359,
"step": 3900
},
{
"epoch": 2.0629190304280556,
"grad_norm": 0.9227738976478577,
"learning_rate": 0.0002798828999483382,
"loss": 0.3202,
"step": 4000
},
{
"epoch": 2.0629190304280556,
"eval_loss": 0.24660241603851318,
"eval_runtime": 61.4456,
"eval_samples_per_second": 15.884,
"eval_steps_per_second": 3.971,
"eval_wer": 0.391597736177623,
"step": 4000
},
{
"epoch": 2.114492006188757,
"grad_norm": 1.0991692543029785,
"learning_rate": 0.00027937144825210947,
"loss": 0.3233,
"step": 4100
},
{
"epoch": 2.114492006188757,
"eval_loss": 0.2494671791791916,
"eval_runtime": 61.9678,
"eval_samples_per_second": 15.75,
"eval_steps_per_second": 3.938,
"eval_wer": 0.39747496734871574,
"step": 4100
},
{
"epoch": 2.1660649819494586,
"grad_norm": 0.6972938776016235,
"learning_rate": 0.000278854830377131,
"loss": 0.3052,
"step": 4200
},
{
"epoch": 2.1660649819494586,
"eval_loss": 0.247751384973526,
"eval_runtime": 61.6785,
"eval_samples_per_second": 15.824,
"eval_steps_per_second": 3.956,
"eval_wer": 0.38985633434915107,
"step": 4200
},
{
"epoch": 2.1660649819494586,
"step": 4200,
"total_flos": 1.0016492328632693e+19,
"train_loss": 0.5294508952186221,
"train_runtime": 6287.6058,
"train_samples_per_second": 37.006,
"train_steps_per_second": 9.252
}
],
"logging_steps": 100,
"max_steps": 58170,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 400,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0016492328632693e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}