{
"best_metric": 3.070533275604248,
"best_model_checkpoint": "./distilled3/checkpoint-46000",
"epoch": 1.7583705765990183,
"eval_steps": 2000,
"global_step": 48000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"combined_loss": 13.355602264404297,
"distill_loss": 1.4010732173919678,
"epoch": 0,
"step": 0,
"student_mlm_loss": 25.310131072998047
},
{
"epoch": 0.003663272034581288,
"grad_norm": 11.128765106201172,
"learning_rate": 1e-05,
"loss": 17.4544,
"step": 100
},
{
"combined_loss": 9.379831314086914,
"distill_loss": 1.5227235555648804,
"epoch": 0.003663272034581288,
"step": 100,
"student_mlm_loss": 17.2369384765625
},
{
"epoch": 0.007326544069162576,
"grad_norm": 14.151921272277832,
"learning_rate": 2e-05,
"loss": 16.0099,
"step": 200
},
{
"combined_loss": 28.136512756347656,
"distill_loss": 1.571045160293579,
"epoch": 0.007326544069162576,
"step": 200,
"student_mlm_loss": 54.70198059082031
},
{
"epoch": 0.010989816103743864,
"grad_norm": 11.68195915222168,
"learning_rate": 3e-05,
"loss": 18.8223,
"step": 300
},
{
"combined_loss": 15.699158668518066,
"distill_loss": 1.5519400835037231,
"epoch": 0.010989816103743864,
"step": 300,
"student_mlm_loss": 29.846376419067383
},
{
"epoch": 0.014653088138325152,
"grad_norm": 8.982569694519043,
"learning_rate": 4e-05,
"loss": 16.9008,
"step": 400
},
{
"combined_loss": 3.035900592803955,
"distill_loss": 1.4880340099334717,
"epoch": 0.014653088138325152,
"step": 400,
"student_mlm_loss": 4.583766937255859
},
{
"epoch": 0.01831636017290644,
"grad_norm": 7.045658111572266,
"learning_rate": 5e-05,
"loss": 8.812,
"step": 500
},
{
"combined_loss": 7.002770900726318,
"distill_loss": 1.351847529411316,
"epoch": 0.01831636017290644,
"step": 500,
"student_mlm_loss": 12.653694152832031
},
{
"epoch": 0.021979632207487727,
"grad_norm": 4.265043258666992,
"learning_rate": 4.9938570410595373e-05,
"loss": 16.8853,
"step": 600
},
{
"combined_loss": 3.2060928344726562,
"distill_loss": 1.2962806224822998,
"epoch": 0.021979632207487727,
"step": 600,
"student_mlm_loss": 5.115904808044434
},
{
"epoch": 0.025642904242069015,
"grad_norm": 7.744924545288086,
"learning_rate": 4.987714082119075e-05,
"loss": 7.1609,
"step": 700
},
{
"combined_loss": 2.2816712856292725,
"distill_loss": 1.5105196237564087,
"epoch": 0.025642904242069015,
"step": 700,
"student_mlm_loss": 3.052823066711426
},
{
"epoch": 0.029306176276650303,
"grad_norm": 12.44052791595459,
"learning_rate": 4.981571123178613e-05,
"loss": 13.0471,
"step": 800
},
{
"combined_loss": 3.225351095199585,
"distill_loss": 1.5753816366195679,
"epoch": 0.029306176276650303,
"step": 800,
"student_mlm_loss": 4.8753204345703125
},
{
"epoch": 0.032969448311231594,
"grad_norm": 6.2059645652771,
"learning_rate": 4.975428164238151e-05,
"loss": 6.2833,
"step": 900
},
{
"combined_loss": 8.580605506896973,
"distill_loss": 1.530474066734314,
"epoch": 0.032969448311231594,
"step": 900,
"student_mlm_loss": 15.630736351013184
},
{
"epoch": 0.03663272034581288,
"grad_norm": 14.731459617614746,
"learning_rate": 4.969285205297688e-05,
"loss": 5.8549,
"step": 1000
},
{
"combined_loss": 3.7085845470428467,
"distill_loss": 1.4659323692321777,
"epoch": 0.03663272034581288,
"step": 1000,
"student_mlm_loss": 5.951236724853516
},
{
"epoch": 0.04029599238039417,
"grad_norm": 9.745060920715332,
"learning_rate": 4.9631422463572256e-05,
"loss": 5.174,
"step": 1100
},
{
"combined_loss": 4.752764701843262,
"distill_loss": 1.4000483751296997,
"epoch": 0.04029599238039417,
"step": 1100,
"student_mlm_loss": 8.105481147766113
},
{
"epoch": 0.043959264414975455,
"grad_norm": 13.801424026489258,
"learning_rate": 4.9569992874167634e-05,
"loss": 19.8368,
"step": 1200
},
{
"combined_loss": 3.1324005126953125,
"distill_loss": 1.404078483581543,
"epoch": 0.043959264414975455,
"step": 1200,
"student_mlm_loss": 4.860722541809082
},
{
"epoch": 0.047622536449556746,
"grad_norm": 52.244632720947266,
"learning_rate": 4.9508563284763005e-05,
"loss": 5.547,
"step": 1300
},
{
"combined_loss": 3.1176328659057617,
"distill_loss": 1.3057805299758911,
"epoch": 0.047622536449556746,
"step": 1300,
"student_mlm_loss": 4.929485321044922
},
{
"epoch": 0.05128580848413803,
"grad_norm": 47.002349853515625,
"learning_rate": 4.944713369535838e-05,
"loss": 4.7784,
"step": 1400
},
{
"combined_loss": 3.871903657913208,
"distill_loss": 1.5537463426589966,
"epoch": 0.05128580848413803,
"step": 1400,
"student_mlm_loss": 6.190061092376709
},
{
"epoch": 0.05494908051871932,
"grad_norm": 11.417911529541016,
"learning_rate": 4.9385704105953754e-05,
"loss": 5.9593,
"step": 1500
},
{
"combined_loss": 6.293668270111084,
"distill_loss": 1.3082151412963867,
"epoch": 0.05494908051871932,
"step": 1500,
"student_mlm_loss": 11.279121398925781
},
{
"epoch": 0.058612352553300606,
"grad_norm": 24.519105911254883,
"learning_rate": 4.932427451654914e-05,
"loss": 7.2762,
"step": 1600
},
{
"combined_loss": 3.350501775741577,
"distill_loss": 1.4593900442123413,
"epoch": 0.058612352553300606,
"step": 1600,
"student_mlm_loss": 5.241613388061523
},
{
"epoch": 0.0622756245878819,
"grad_norm": 42.58499526977539,
"learning_rate": 4.926284492714451e-05,
"loss": 7.1364,
"step": 1700
},
{
"combined_loss": 10.976073265075684,
"distill_loss": 1.594639539718628,
"epoch": 0.0622756245878819,
"step": 1700,
"student_mlm_loss": 20.357507705688477
},
{
"epoch": 0.06593889662246319,
"grad_norm": 105.27689361572266,
"learning_rate": 4.920141533773989e-05,
"loss": 5.7662,
"step": 1800
},
{
"combined_loss": 4.272126197814941,
"distill_loss": 1.4649100303649902,
"epoch": 0.06593889662246319,
"step": 1800,
"student_mlm_loss": 7.079341888427734
},
{
"epoch": 0.06960216865704447,
"grad_norm": 9.272991180419922,
"learning_rate": 4.913998574833526e-05,
"loss": 4.9898,
"step": 1900
},
{
"combined_loss": 2.2884514331817627,
"distill_loss": 1.5105092525482178,
"epoch": 0.06960216865704447,
"step": 1900,
"student_mlm_loss": 3.0663936138153076
},
{
"epoch": 0.07326544069162576,
"grad_norm": 15.299578666687012,
"learning_rate": 4.9078556158930636e-05,
"loss": 6.8909,
"step": 2000
},
{
"epoch": 0.07326544069162576,
"eval_loss": 6.166979789733887,
"eval_runtime": 2.1158,
"eval_samples_per_second": 3306.616,
"eval_steps_per_second": 13.234,
"step": 2000
},
{
"combined_loss": 5.612101078033447,
"distill_loss": 1.332657814025879,
"epoch": 0.07326544069162576,
"step": 2000,
"student_mlm_loss": 9.891544342041016
},
{
"epoch": 0.07692871272620705,
"grad_norm": 12.242279052734375,
"learning_rate": 4.9017126569526014e-05,
"loss": 8.6608,
"step": 2100
},
{
"combined_loss": 2.035828113555908,
"distill_loss": 1.3731106519699097,
"epoch": 0.07692871272620705,
"step": 2100,
"student_mlm_loss": 2.6985456943511963
},
{
"epoch": 0.08059198476078834,
"grad_norm": 27.212379455566406,
"learning_rate": 4.8955696980121385e-05,
"loss": 9.4649,
"step": 2200
},
{
"combined_loss": 2.5593996047973633,
"distill_loss": 1.5456775426864624,
"epoch": 0.08059198476078834,
"step": 2200,
"student_mlm_loss": 3.5731217861175537
},
{
"epoch": 0.08425525679536962,
"grad_norm": 9.444129943847656,
"learning_rate": 4.889426739071676e-05,
"loss": 12.6304,
"step": 2300
},
{
"combined_loss": 3.0112435817718506,
"distill_loss": 1.268593192100525,
"epoch": 0.08425525679536962,
"step": 2300,
"student_mlm_loss": 4.753893852233887
},
{
"epoch": 0.08791852882995091,
"grad_norm": 6.72172212600708,
"learning_rate": 4.8832837801312134e-05,
"loss": 4.2453,
"step": 2400
},
{
"combined_loss": 2.3823843002319336,
"distill_loss": 1.3674836158752441,
"epoch": 0.08791852882995091,
"step": 2400,
"student_mlm_loss": 3.397284984588623
},
{
"epoch": 0.0915818008645322,
"grad_norm": 88.5478744506836,
"learning_rate": 4.877140821190752e-05,
"loss": 4.6849,
"step": 2500
},
{
"combined_loss": 3.8919034004211426,
"distill_loss": 1.523806095123291,
"epoch": 0.0915818008645322,
"step": 2500,
"student_mlm_loss": 6.260000705718994
},
{
"epoch": 0.09524507289911349,
"grad_norm": 11.671692848205566,
"learning_rate": 4.870997862250289e-05,
"loss": 4.8686,
"step": 2600
},
{
"combined_loss": 2.8186635971069336,
"distill_loss": 1.313085913658142,
"epoch": 0.09524507289911349,
"step": 2600,
"student_mlm_loss": 4.3242411613464355
},
{
"epoch": 0.09890834493369477,
"grad_norm": 7.681136131286621,
"learning_rate": 4.864854903309827e-05,
"loss": 14.7468,
"step": 2700
},
{
"combined_loss": 2.6350021362304688,
"distill_loss": 1.5300695896148682,
"epoch": 0.09890834493369477,
"step": 2700,
"student_mlm_loss": 3.7399346828460693
},
{
"epoch": 0.10257161696827606,
"grad_norm": 10.245522499084473,
"learning_rate": 4.858711944369364e-05,
"loss": 4.7465,
"step": 2800
},
{
"combined_loss": 1.9805179834365845,
"distill_loss": 1.3671844005584717,
"epoch": 0.10257161696827606,
"step": 2800,
"student_mlm_loss": 2.5938515663146973
},
{
"epoch": 0.10623488900285735,
"grad_norm": 51.705352783203125,
"learning_rate": 4.8525689854289016e-05,
"loss": 3.8985,
"step": 2900
},
{
"combined_loss": 1.9335501194000244,
"distill_loss": 1.3294615745544434,
"epoch": 0.10623488900285735,
"step": 2900,
"student_mlm_loss": 2.5376386642456055
},
{
"epoch": 0.10989816103743864,
"grad_norm": 7.661074161529541,
"learning_rate": 4.8464260264884394e-05,
"loss": 3.9846,
"step": 3000
},
{
"combined_loss": 2.815329074859619,
"distill_loss": 1.5120948553085327,
"epoch": 0.10989816103743864,
"step": 3000,
"student_mlm_loss": 4.118563175201416
},
{
"epoch": 0.11356143307201993,
"grad_norm": 3.9512596130371094,
"learning_rate": 4.8402830675479765e-05,
"loss": 5.6509,
"step": 3100
},
{
"combined_loss": 5.329846382141113,
"distill_loss": 1.5839005708694458,
"epoch": 0.11356143307201993,
"step": 3100,
"student_mlm_loss": 9.07579231262207
},
{
"epoch": 0.11722470510660121,
"grad_norm": 21.47922134399414,
"learning_rate": 4.834140108607514e-05,
"loss": 4.5437,
"step": 3200
},
{
"combined_loss": 3.32517147064209,
"distill_loss": 1.4834882020950317,
"epoch": 0.11722470510660121,
"step": 3200,
"student_mlm_loss": 5.1668548583984375
},
{
"epoch": 0.1208879771411825,
"grad_norm": 11.865033149719238,
"learning_rate": 4.827997149667052e-05,
"loss": 5.0218,
"step": 3300
},
{
"combined_loss": 2.84318208694458,
"distill_loss": 1.302217960357666,
"epoch": 0.1208879771411825,
"step": 3300,
"student_mlm_loss": 4.384146213531494
},
{
"epoch": 0.1245512491757638,
"grad_norm": 13.824487686157227,
"learning_rate": 4.82185419072659e-05,
"loss": 33.2949,
"step": 3400
},
{
"combined_loss": 2.065192937850952,
"distill_loss": 1.3474924564361572,
"epoch": 0.1245512491757638,
"step": 3400,
"student_mlm_loss": 2.782893419265747
},
{
"epoch": 0.12821452121034507,
"grad_norm": 34.21382522583008,
"learning_rate": 4.815711231786127e-05,
"loss": 12.5775,
"step": 3500
},
{
"combined_loss": 2.2148988246917725,
"distill_loss": 1.616875171661377,
"epoch": 0.12821452121034507,
"step": 3500,
"student_mlm_loss": 2.812922477722168
},
{
"epoch": 0.13187779324492638,
"grad_norm": 8.859841346740723,
"learning_rate": 4.809568272845665e-05,
"loss": 4.6975,
"step": 3600
},
{
"combined_loss": 4.478976726531982,
"distill_loss": 1.3554083108901978,
"epoch": 0.13187779324492638,
"step": 3600,
"student_mlm_loss": 7.602544784545898
},
{
"epoch": 0.13554106527950766,
"grad_norm": 12.680179595947266,
"learning_rate": 4.803425313905202e-05,
"loss": 4.5414,
"step": 3700
},
{
"combined_loss": 6.908867835998535,
"distill_loss": 1.3570021390914917,
"epoch": 0.13554106527950766,
"step": 3700,
"student_mlm_loss": 12.460733413696289
},
{
"epoch": 0.13920433731408893,
"grad_norm": 18.478200912475586,
"learning_rate": 4.7972823549647396e-05,
"loss": 35.1443,
"step": 3800
},
{
"combined_loss": 13.97608757019043,
"distill_loss": 1.418832778930664,
"epoch": 0.13920433731408893,
"step": 3800,
"student_mlm_loss": 26.533342361450195
},
{
"epoch": 0.14286760934867024,
"grad_norm": 10.53610897064209,
"learning_rate": 4.7911393960242774e-05,
"loss": 13.766,
"step": 3900
},
{
"combined_loss": 2.1997413635253906,
"distill_loss": 1.4529953002929688,
"epoch": 0.14286760934867024,
"step": 3900,
"student_mlm_loss": 2.9464874267578125
},
{
"epoch": 0.14653088138325152,
"grad_norm": 42.095558166503906,
"learning_rate": 4.7849964370838145e-05,
"loss": 3.297,
"step": 4000
},
{
"epoch": 0.14653088138325152,
"eval_loss": 4.568027496337891,
"eval_runtime": 2.0693,
"eval_samples_per_second": 3380.818,
"eval_steps_per_second": 13.531,
"step": 4000
},
{
"combined_loss": 2.278163433074951,
"distill_loss": 1.5395259857177734,
"epoch": 0.14653088138325152,
"step": 4000,
"student_mlm_loss": 3.016800880432129
},
{
"epoch": 0.15019415341783282,
"grad_norm": 15.655592918395996,
"learning_rate": 4.778853478143352e-05,
"loss": 4.5795,
"step": 4100
},
{
"combined_loss": 2.117962598800659,
"distill_loss": 1.5073814392089844,
"epoch": 0.15019415341783282,
"step": 4100,
"student_mlm_loss": 2.728543758392334
},
{
"epoch": 0.1538574254524141,
"grad_norm": 9.47999382019043,
"learning_rate": 4.77271051920289e-05,
"loss": 4.6384,
"step": 4200
},
{
"combined_loss": 2.2614216804504395,
"distill_loss": 1.3999947309494019,
"epoch": 0.1538574254524141,
"step": 4200,
"student_mlm_loss": 3.1228485107421875
},
{
"epoch": 0.15752069748699538,
"grad_norm": 12.137129783630371,
"learning_rate": 4.766567560262428e-05,
"loss": 3.6101,
"step": 4300
},
{
"combined_loss": 1.9776763916015625,
"distill_loss": 1.4785245656967163,
"epoch": 0.15752069748699538,
"step": 4300,
"student_mlm_loss": 2.476828098297119
},
{
"epoch": 0.16118396952157668,
"grad_norm": 74.8094253540039,
"learning_rate": 4.760424601321965e-05,
"loss": 4.9111,
"step": 4400
},
{
"combined_loss": 3.0158274173736572,
"distill_loss": 1.2940564155578613,
"epoch": 0.16118396952157668,
"step": 4400,
"student_mlm_loss": 4.737598419189453
},
{
"epoch": 0.16484724155615796,
"grad_norm": 5.339694499969482,
"learning_rate": 4.754281642381502e-05,
"loss": 3.4013,
"step": 4500
},
{
"combined_loss": 2.176065683364868,
"distill_loss": 1.5688632726669312,
"epoch": 0.16484724155615796,
"step": 4500,
"student_mlm_loss": 2.7832682132720947
},
{
"epoch": 0.16851051359073924,
"grad_norm": 12.745500564575195,
"learning_rate": 4.74813868344104e-05,
"loss": 3.1244,
"step": 4600
},
{
"combined_loss": 2.4230682849884033,
"distill_loss": 1.46636962890625,
"epoch": 0.16851051359073924,
"step": 4600,
"student_mlm_loss": 3.3797669410705566
},
{
"epoch": 0.17217378562532054,
"grad_norm": 14.515507698059082,
"learning_rate": 4.7419957245005777e-05,
"loss": 4.9862,
"step": 4700
},
{
"combined_loss": 6.772428512573242,
"distill_loss": 1.6445391178131104,
"epoch": 0.17217378562532054,
"step": 4700,
"student_mlm_loss": 11.900318145751953
},
{
"epoch": 0.17583705765990182,
"grad_norm": 10.036664962768555,
"learning_rate": 4.7358527655601154e-05,
"loss": 3.72,
"step": 4800
},
{
"combined_loss": 27.606048583984375,
"distill_loss": 1.4302338361740112,
"epoch": 0.17583705765990182,
"step": 4800,
"student_mlm_loss": 53.781864166259766
},
{
"epoch": 0.17950032969448312,
"grad_norm": 14.220582008361816,
"learning_rate": 4.7297098066196525e-05,
"loss": 9.0684,
"step": 4900
},
{
"combined_loss": 7.97739839553833,
"distill_loss": 1.4764257669448853,
"epoch": 0.17950032969448312,
"step": 4900,
"student_mlm_loss": 14.478370666503906
},
{
"epoch": 0.1831636017290644,
"grad_norm": 8.734748840332031,
"learning_rate": 4.72356684767919e-05,
"loss": 13.2974,
"step": 5000
},
{
"combined_loss": 3.3007736206054688,
"distill_loss": 1.5111989974975586,
"epoch": 0.1831636017290644,
"step": 5000,
"student_mlm_loss": 5.090348243713379
},
{
"epoch": 0.18682687376364568,
"grad_norm": 23.457653045654297,
"learning_rate": 4.717423888738728e-05,
"loss": 4.4811,
"step": 5100
},
{
"combined_loss": 2.695789337158203,
"distill_loss": 1.4495799541473389,
"epoch": 0.18682687376364568,
"step": 5100,
"student_mlm_loss": 3.9419989585876465
},
{
"epoch": 0.19049014579822698,
"grad_norm": 11.504470825195312,
"learning_rate": 4.711280929798265e-05,
"loss": 3.2576,
"step": 5200
},
{
"combined_loss": 3.5765743255615234,
"distill_loss": 1.3500127792358398,
"epoch": 0.19049014579822698,
"step": 5200,
"student_mlm_loss": 5.803135871887207
},
{
"epoch": 0.19415341783280826,
"grad_norm": 34.68207550048828,
"learning_rate": 4.705137970857803e-05,
"loss": 5.8403,
"step": 5300
},
{
"combined_loss": 4.304483413696289,
"distill_loss": 1.4075747728347778,
"epoch": 0.19415341783280826,
"step": 5300,
"student_mlm_loss": 7.20139217376709
},
{
"epoch": 0.19781668986738954,
"grad_norm": 22.416582107543945,
"learning_rate": 4.69899501191734e-05,
"loss": 4.045,
"step": 5400
},
{
"combined_loss": 1.9111289978027344,
"distill_loss": 1.321276307106018,
"epoch": 0.19781668986738954,
"step": 5400,
"student_mlm_loss": 2.500981569290161
},
{
"epoch": 0.20147996190197084,
"grad_norm": 27.66775894165039,
"learning_rate": 4.6928520529768786e-05,
"loss": 3.8896,
"step": 5500
},
{
"combined_loss": 2.142390251159668,
"distill_loss": 1.4025957584381104,
"epoch": 0.20147996190197084,
"step": 5500,
"student_mlm_loss": 2.8821845054626465
},
{
"epoch": 0.20514323393655212,
"grad_norm": 35.84339141845703,
"learning_rate": 4.686709094036416e-05,
"loss": 4.94,
"step": 5600
},
{
"combined_loss": 2.1642816066741943,
"distill_loss": 1.392912745475769,
"epoch": 0.20514323393655212,
"step": 5600,
"student_mlm_loss": 2.935650587081909
},
{
"epoch": 0.20880650597113343,
"grad_norm": 18.43452262878418,
"learning_rate": 4.6805661350959535e-05,
"loss": 7.4575,
"step": 5700
},
{
"combined_loss": 2.354356288909912,
"distill_loss": 1.3411612510681152,
"epoch": 0.20880650597113343,
"step": 5700,
"student_mlm_loss": 3.36755108833313
},
{
"epoch": 0.2124697780057147,
"grad_norm": 5.364467144012451,
"learning_rate": 4.6744231761554906e-05,
"loss": 3.2172,
"step": 5800
},
{
"combined_loss": 2.129748821258545,
"distill_loss": 1.4555408954620361,
"epoch": 0.2124697780057147,
"step": 5800,
"student_mlm_loss": 2.8039567470550537
},
{
"epoch": 0.21613305004029598,
"grad_norm": 12.704414367675781,
"learning_rate": 4.6682802172150283e-05,
"loss": 9.9214,
"step": 5900
},
{
"combined_loss": 5.396609783172607,
"distill_loss": 1.3954136371612549,
"epoch": 0.21613305004029598,
"step": 5900,
"student_mlm_loss": 9.397806167602539
},
{
"epoch": 0.2197963220748773,
"grad_norm": 9.411243438720703,
"learning_rate": 4.662137258274566e-05,
"loss": 4.6268,
"step": 6000
},
{
"epoch": 0.2197963220748773,
"eval_loss": 4.474331855773926,
"eval_runtime": 2.0765,
"eval_samples_per_second": 3369.116,
"eval_steps_per_second": 13.484,
"step": 6000
},
{
"combined_loss": 2.3863794803619385,
"distill_loss": 1.4665789604187012,
"epoch": 0.2197963220748773,
"step": 6000,
"student_mlm_loss": 3.306180000305176
},
{
"epoch": 0.22345959410945856,
"grad_norm": 15.34604263305664,
"learning_rate": 4.655994299334103e-05,
"loss": 3.586,
"step": 6100
},
{
"combined_loss": 2.5740702152252197,
"distill_loss": 1.5186127424240112,
"epoch": 0.22345959410945856,
"step": 6100,
"student_mlm_loss": 3.6295275688171387
},
{
"epoch": 0.22712286614403987,
"grad_norm": 10.821826934814453,
"learning_rate": 4.649851340393641e-05,
"loss": 5.516,
"step": 6200
},
{
"combined_loss": 4.770940780639648,
"distill_loss": 1.5328683853149414,
"epoch": 0.22712286614403987,
"step": 6200,
"student_mlm_loss": 8.009013175964355
},
{
"epoch": 0.23078613817862115,
"grad_norm": 45.33203887939453,
"learning_rate": 4.643708381453178e-05,
"loss": 6.4937,
"step": 6300
},
{
"combined_loss": 2.257235050201416,
"distill_loss": 1.4594223499298096,
"epoch": 0.23078613817862115,
"step": 6300,
"student_mlm_loss": 3.0550475120544434
},
{
"epoch": 0.23444941021320242,
"grad_norm": 24.137001037597656,
"learning_rate": 4.6375654225127166e-05,
"loss": 2.8761,
"step": 6400
},
{
"combined_loss": 3.673408031463623,
"distill_loss": 1.5113860368728638,
"epoch": 0.23444941021320242,
"step": 6400,
"student_mlm_loss": 5.835430145263672
},
{
"epoch": 0.23811268224778373,
"grad_norm": 89.53437042236328,
"learning_rate": 4.631422463572254e-05,
"loss": 4.9469,
"step": 6500
},
{
"combined_loss": 2.289175271987915,
"distill_loss": 1.6255369186401367,
"epoch": 0.23811268224778373,
"step": 6500,
"student_mlm_loss": 2.9528136253356934
},
{
"epoch": 0.241775954282365,
"grad_norm": 29.47341537475586,
"learning_rate": 4.6252795046317915e-05,
"loss": 3.2857,
"step": 6600
},
{
"combined_loss": 2.986036777496338,
"distill_loss": 1.3628634214401245,
"epoch": 0.241775954282365,
"step": 6600,
"student_mlm_loss": 4.609210014343262
},
{
"epoch": 0.24543922631694629,
"grad_norm": 8.413643836975098,
"learning_rate": 4.6191365456913286e-05,
"loss": 4.1874,
"step": 6700
},
{
"combined_loss": 4.9381103515625,
"distill_loss": 1.5604116916656494,
"epoch": 0.24543922631694629,
"step": 6700,
"student_mlm_loss": 8.31580924987793
},
{
"epoch": 0.2491024983515276,
"grad_norm": 19.279678344726562,
"learning_rate": 4.6129935867508664e-05,
"loss": 5.5581,
"step": 6800
},
{
"combined_loss": 4.7175493240356445,
"distill_loss": 1.5657355785369873,
"epoch": 0.2491024983515276,
"step": 6800,
"student_mlm_loss": 7.869362831115723
},
{
"epoch": 0.25276577038610887,
"grad_norm": 14.9283447265625,
"learning_rate": 4.606850627810404e-05,
"loss": 4.6319,
"step": 6900
},
{
"combined_loss": 5.707411766052246,
"distill_loss": 1.566019058227539,
"epoch": 0.25276577038610887,
"step": 6900,
"student_mlm_loss": 9.848804473876953
},
{
"epoch": 0.25642904242069015,
"grad_norm": 5.006555557250977,
"learning_rate": 4.600707668869941e-05,
"loss": 6.1192,
"step": 7000
},
{
"combined_loss": 4.373297691345215,
"distill_loss": 1.4654217958450317,
"epoch": 0.25642904242069015,
"step": 7000,
"student_mlm_loss": 7.281173229217529
},
{
"epoch": 0.2600923144552714,
"grad_norm": 15.025683403015137,
"learning_rate": 4.594564709929479e-05,
"loss": 3.472,
"step": 7100
},
{
"combined_loss": 5.1388630867004395,
"distill_loss": 1.5254905223846436,
"epoch": 0.2600923144552714,
"step": 7100,
"student_mlm_loss": 8.752235412597656
},
{
"epoch": 0.26375558648985276,
"grad_norm": 44.157169342041016,
"learning_rate": 4.588421750989017e-05,
"loss": 8.8482,
"step": 7200
},
{
"combined_loss": 2.1565892696380615,
"distill_loss": 1.2985585927963257,
"epoch": 0.26375558648985276,
"step": 7200,
"student_mlm_loss": 3.014619827270508
},
{
"epoch": 0.26741885852443403,
"grad_norm": 5.755523204803467,
"learning_rate": 4.5822787920485546e-05,
"loss": 5.7829,
"step": 7300
},
{
"combined_loss": 2.5404441356658936,
"distill_loss": 1.5058717727661133,
"epoch": 0.26741885852443403,
"step": 7300,
"student_mlm_loss": 3.575016498565674
},
{
"epoch": 0.2710821305590153,
"grad_norm": 15.252013206481934,
"learning_rate": 4.576135833108092e-05,
"loss": 7.9361,
"step": 7400
},
{
"combined_loss": 2.5752511024475098,
"distill_loss": 1.5916697978973389,
"epoch": 0.2710821305590153,
"step": 7400,
"student_mlm_loss": 3.5588326454162598
},
{
"epoch": 0.2747454025935966,
"grad_norm": 26.218740463256836,
"learning_rate": 4.5699928741676295e-05,
"loss": 4.8534,
"step": 7500
},
{
"combined_loss": 2.1656486988067627,
"distill_loss": 1.4179739952087402,
"epoch": 0.2747454025935966,
"step": 7500,
"student_mlm_loss": 2.913323402404785
},
{
"epoch": 0.27840867462817787,
"grad_norm": 6.031148910522461,
"learning_rate": 4.5638499152271666e-05,
"loss": 6.4535,
"step": 7600
},
{
"combined_loss": 2.8603813648223877,
"distill_loss": 1.5837383270263672,
"epoch": 0.27840867462817787,
"step": 7600,
"student_mlm_loss": 4.137024402618408
},
{
"epoch": 0.2820719466627592,
"grad_norm": 107.95591735839844,
"learning_rate": 4.5577069562867044e-05,
"loss": 3.2702,
"step": 7700
},
{
"combined_loss": 1.8474111557006836,
"distill_loss": 1.437280297279358,
"epoch": 0.2820719466627592,
"step": 7700,
"student_mlm_loss": 2.257542133331299
},
{
"epoch": 0.2857352186973405,
"grad_norm": 5.394913673400879,
"learning_rate": 4.551563997346242e-05,
"loss": 2.8998,
"step": 7800
},
{
"combined_loss": 4.77987813949585,
"distill_loss": 1.5358555316925049,
"epoch": 0.2857352186973405,
"step": 7800,
"student_mlm_loss": 8.023900985717773
},
{
"epoch": 0.28939849073192175,
"grad_norm": 7.790286540985107,
"learning_rate": 4.545421038405779e-05,
"loss": 2.9018,
"step": 7900
},
{
"combined_loss": 3.34071946144104,
"distill_loss": 1.3893283605575562,
"epoch": 0.28939849073192175,
"step": 7900,
"student_mlm_loss": 5.292110443115234
},
{
"epoch": 0.29306176276650303,
"grad_norm": 10.3685941696167,
"learning_rate": 4.539278079465317e-05,
"loss": 3.5884,
"step": 8000
},
{
"epoch": 0.29306176276650303,
"eval_loss": 3.7581117153167725,
"eval_runtime": 2.0302,
"eval_samples_per_second": 3446.049,
"eval_steps_per_second": 13.792,
"step": 8000
},
{
"combined_loss": 2.8955559730529785,
"distill_loss": 1.3627426624298096,
"epoch": 0.29306176276650303,
"step": 8000,
"student_mlm_loss": 4.428369522094727
},
{
"epoch": 0.2967250348010843,
"grad_norm": 49.06619644165039,
"learning_rate": 4.533135120524855e-05,
"loss": 3.5788,
"step": 8100
},
{
"combined_loss": 4.52724552154541,
"distill_loss": 1.3924285173416138,
"epoch": 0.2967250348010843,
"step": 8100,
"student_mlm_loss": 7.662062644958496
},
{
"epoch": 0.30038830683566564,
"grad_norm": 27.40319061279297,
"learning_rate": 4.5269921615843926e-05,
"loss": 3.9229,
"step": 8200
},
{
"combined_loss": 3.3075461387634277,
"distill_loss": 1.5311795473098755,
"epoch": 0.30038830683566564,
"step": 8200,
"student_mlm_loss": 5.0839128494262695
},
{
"epoch": 0.3040515788702469,
"grad_norm": 31.07562255859375,
"learning_rate": 4.52084920264393e-05,
"loss": 3.9566,
"step": 8300
},
{
"combined_loss": 1.9784274101257324,
"distill_loss": 1.41036057472229,
"epoch": 0.3040515788702469,
"step": 8300,
"student_mlm_loss": 2.546494245529175
},
{
"epoch": 0.3077148509048282,
"grad_norm": 4.548298358917236,
"learning_rate": 4.514706243703467e-05,
"loss": 5.1591,
"step": 8400
},
{
"combined_loss": 1.9796760082244873,
"distill_loss": 1.408158302307129,
"epoch": 0.3077148509048282,
"step": 8400,
"student_mlm_loss": 2.5511937141418457
},
{
"epoch": 0.3113781229394095,
"grad_norm": 8.897561073303223,
"learning_rate": 4.5085632847630046e-05,
"loss": 5.7057,
"step": 8500
},
{
"combined_loss": 2.080671548843384,
"distill_loss": 1.4321857690811157,
"epoch": 0.3113781229394095,
"step": 8500,
"student_mlm_loss": 2.7291574478149414
},
{
"epoch": 0.31504139497399075,
"grad_norm": 10.005053520202637,
"learning_rate": 4.5024203258225424e-05,
"loss": 7.7928,
"step": 8600
},
{
"combined_loss": 2.6395342350006104,
"distill_loss": 1.5675503015518188,
"epoch": 0.31504139497399075,
"step": 8600,
"student_mlm_loss": 3.7115182876586914
},
{
"epoch": 0.31870466700857203,
"grad_norm": 5.425146579742432,
"learning_rate": 4.49627736688208e-05,
"loss": 3.7716,
"step": 8700
},
{
"combined_loss": 2.9848690032958984,
"distill_loss": 1.592170000076294,
"epoch": 0.31870466700857203,
"step": 8700,
"student_mlm_loss": 4.377568244934082
},
{
"epoch": 0.32236793904315336,
"grad_norm": 5.64302396774292,
"learning_rate": 4.490134407941617e-05,
"loss": 6.8888,
"step": 8800
},
{
"combined_loss": 4.167844772338867,
"distill_loss": 1.4308810234069824,
"epoch": 0.32236793904315336,
"step": 8800,
"student_mlm_loss": 6.904808044433594
},
{
"epoch": 0.32603121107773464,
"grad_norm": 99.88166809082031,
"learning_rate": 4.483991449001155e-05,
"loss": 3.988,
"step": 8900
},
{
"combined_loss": 2.484290599822998,
"distill_loss": 1.3509743213653564,
"epoch": 0.32603121107773464,
"step": 8900,
"student_mlm_loss": 3.6176071166992188
},
{
"epoch": 0.3296944831123159,
"grad_norm": 74.52608489990234,
"learning_rate": 4.477848490060693e-05,
"loss": 7.0959,
"step": 9000
},
{
"combined_loss": 3.0457074642181396,
"distill_loss": 1.3116565942764282,
"epoch": 0.3296944831123159,
"step": 9000,
"student_mlm_loss": 4.779758453369141
},
{
"epoch": 0.3333577551468972,
"grad_norm": 11.735849380493164,
"learning_rate": 4.47170553112023e-05,
"loss": 3.3274,
"step": 9100
},
{
"combined_loss": 4.452191352844238,
"distill_loss": 1.3943032026290894,
"epoch": 0.3333577551468972,
"step": 9100,
"student_mlm_loss": 7.510079860687256
},
{
"epoch": 0.33702102718147847,
"grad_norm": 9.601778030395508,
"learning_rate": 4.465562572179768e-05,
"loss": 3.8928,
"step": 9200
},
{
"combined_loss": 4.875356197357178,
"distill_loss": 1.4536867141723633,
"epoch": 0.33702102718147847,
"step": 9200,
"student_mlm_loss": 8.297025680541992
},
{
"epoch": 0.3406842992160598,
"grad_norm": 9.49219799041748,
"learning_rate": 4.459419613239305e-05,
"loss": 3.7362,
"step": 9300
},
{
"combined_loss": 2.9027719497680664,
"distill_loss": 1.3480241298675537,
"epoch": 0.3406842992160598,
"step": 9300,
"student_mlm_loss": 4.45751953125
},
{
"epoch": 0.3443475712506411,
"grad_norm": 7.6804728507995605,
"learning_rate": 4.453276654298843e-05,
"loss": 4.4018,
"step": 9400
},
{
"combined_loss": 2.7022647857666016,
"distill_loss": 1.3614214658737183,
"epoch": 0.3443475712506411,
"step": 9400,
"student_mlm_loss": 4.043107986450195
},
{
"epoch": 0.34801084328522236,
"grad_norm": 38.41388702392578,
"learning_rate": 4.4471336953583804e-05,
"loss": 3.0632,
"step": 9500
},
{
"combined_loss": 1.9494025707244873,
"distill_loss": 1.3876396417617798,
"epoch": 0.34801084328522236,
"step": 9500,
"student_mlm_loss": 2.5111656188964844
},
{
"epoch": 0.35167411531980364,
"grad_norm": 37.10932540893555,
"learning_rate": 4.440990736417918e-05,
"loss": 3.3258,
"step": 9600
},
{
"combined_loss": 2.6435036659240723,
"distill_loss": 1.3941702842712402,
"epoch": 0.35167411531980364,
"step": 9600,
"student_mlm_loss": 3.8928370475769043
},
{
"epoch": 0.3553373873543849,
"grad_norm": 17.652099609375,
"learning_rate": 4.434847777477455e-05,
"loss": 8.3854,
"step": 9700
},
{
"combined_loss": 2.336359977722168,
"distill_loss": 1.5497583150863647,
"epoch": 0.3553373873543849,
"step": 9700,
"student_mlm_loss": 3.1229615211486816
},
{
"epoch": 0.35900065938896625,
"grad_norm": 58.41902160644531,
"learning_rate": 4.428704818536993e-05,
"loss": 6.9624,
"step": 9800
},
{
"combined_loss": 2.6561923027038574,
"distill_loss": 1.5154696702957153,
"epoch": 0.35900065938896625,
"step": 9800,
"student_mlm_loss": 3.796915054321289
},
{
"epoch": 0.3626639314235475,
"grad_norm": 23.230680465698242,
"learning_rate": 4.422561859596531e-05,
"loss": 3.4226,
"step": 9900
},
{
"combined_loss": 1.9643871784210205,
"distill_loss": 1.3770619630813599,
"epoch": 0.3626639314235475,
"step": 9900,
"student_mlm_loss": 2.5517125129699707
},
{
"epoch": 0.3663272034581288,
"grad_norm": 11.580951690673828,
"learning_rate": 4.416418900656068e-05,
"loss": 4.7414,
"step": 10000
},
{
"epoch": 0.3663272034581288,
"eval_loss": 3.8432743549346924,
"eval_runtime": 2.2879,
"eval_samples_per_second": 3057.772,
"eval_steps_per_second": 12.238,
"step": 10000
},
{
"combined_loss": 2.395519971847534,
"distill_loss": 1.382614254951477,
"epoch": 0.3663272034581288,
"step": 10000,
"student_mlm_loss": 3.408425807952881
},
{
"epoch": 0.3699904754927101,
"grad_norm": 19.014955520629883,
"learning_rate": 4.410275941715606e-05,
"loss": 6.6365,
"step": 10100
},
{
"combined_loss": 2.1697921752929688,
"distill_loss": 1.5128508806228638,
"epoch": 0.3699904754927101,
"step": 10100,
"student_mlm_loss": 2.8267335891723633
},
{
"epoch": 0.37365374752729136,
"grad_norm": 6.532296180725098,
"learning_rate": 4.404132982775143e-05,
"loss": 3.199,
"step": 10200
},
{
"combined_loss": 1.8516874313354492,
"distill_loss": 1.413927674293518,
"epoch": 0.37365374752729136,
"step": 10200,
"student_mlm_loss": 2.289447069168091
},
{
"epoch": 0.3773170195618727,
"grad_norm": 25.607181549072266,
"learning_rate": 4.397990023834681e-05,
"loss": 3.822,
"step": 10300
},
{
"combined_loss": 3.3827946186065674,
"distill_loss": 1.4635933637619019,
"epoch": 0.3773170195618727,
"step": 10300,
"student_mlm_loss": 5.301995754241943
},
{
"epoch": 0.38098029159645397,
"grad_norm": 12.52314567565918,
"learning_rate": 4.3918470648942184e-05,
"loss": 6.9491,
"step": 10400
},
{
"combined_loss": 1.9748457670211792,
"distill_loss": 1.445707082748413,
"epoch": 0.38098029159645397,
"step": 10400,
"student_mlm_loss": 2.5039844512939453
},
{
"epoch": 0.38464356363103525,
"grad_norm": 12.69713306427002,
"learning_rate": 4.385704105953756e-05,
"loss": 9.4794,
"step": 10500
},
{
"combined_loss": 3.5582261085510254,
"distill_loss": 1.4324952363967896,
"epoch": 0.38464356363103525,
"step": 10500,
"student_mlm_loss": 5.683957099914551
},
{
"epoch": 0.3883068356656165,
"grad_norm": 9.131495475769043,
"learning_rate": 4.379561147013293e-05,
"loss": 7.1932,
"step": 10600
},
{
"combined_loss": 6.080216407775879,
"distill_loss": 1.477283000946045,
"epoch": 0.3883068356656165,
"step": 10600,
"student_mlm_loss": 10.683149337768555
},
{
"epoch": 0.3919701077001978,
"grad_norm": 24.739810943603516,
"learning_rate": 4.373418188072831e-05,
"loss": 5.6399,
"step": 10700
},
{
"combined_loss": 3.7993698120117188,
"distill_loss": 1.452317476272583,
"epoch": 0.3919701077001978,
"step": 10700,
"student_mlm_loss": 6.146422386169434
},
{
"epoch": 0.3956333797347791,
"grad_norm": 42.44218063354492,
"learning_rate": 4.367275229132369e-05,
"loss": 4.2291,
"step": 10800
},
{
"combined_loss": 2.037079095840454,
"distill_loss": 1.4349570274353027,
"epoch": 0.3956333797347791,
"step": 10800,
"student_mlm_loss": 2.6392011642456055
},
{
"epoch": 0.3992966517693604,
"grad_norm": 231.26116943359375,
"learning_rate": 4.361132270191906e-05,
"loss": 4.6188,
"step": 10900
},
{
"combined_loss": 182.1781768798828,
"distill_loss": 1.4427307844161987,
"epoch": 0.3992966517693604,
"step": 10900,
"student_mlm_loss": 362.91363525390625
},
{
"epoch": 0.4029599238039417,
"grad_norm": 16.01262092590332,
"learning_rate": 4.354989311251444e-05,
"loss": 4.8535,
"step": 11000
},
{
"combined_loss": 3.2922308444976807,
"distill_loss": 1.7308834791183472,
"epoch": 0.4029599238039417,
"step": 11000,
"student_mlm_loss": 4.853578090667725
},
{
"epoch": 0.40662319583852297,
"grad_norm": 23.69573974609375,
"learning_rate": 4.3488463523109816e-05,
"loss": 2.8692,
"step": 11100
},
{
"combined_loss": 2.1010890007019043,
"distill_loss": 1.3140019178390503,
"epoch": 0.40662319583852297,
"step": 11100,
"student_mlm_loss": 2.888176202774048
},
{
"epoch": 0.41028646787310424,
"grad_norm": 9.695125579833984,
"learning_rate": 4.3427033933705193e-05,
"loss": 7.6829,
"step": 11200
},
{
"combined_loss": 2.24194598197937,
"distill_loss": 1.560063362121582,
"epoch": 0.41028646787310424,
"step": 11200,
"student_mlm_loss": 2.923828601837158
},
{
"epoch": 0.4139497399076855,
"grad_norm": 37.06310272216797,
"learning_rate": 4.3365604344300565e-05,
"loss": 3.5562,
"step": 11300
},
{
"combined_loss": 9.297407150268555,
"distill_loss": 1.2328678369522095,
"epoch": 0.4139497399076855,
"step": 11300,
"student_mlm_loss": 17.36194610595703
},
{
"epoch": 0.41761301194226685,
"grad_norm": 6.411166667938232,
"learning_rate": 4.330417475489594e-05,
"loss": 4.0543,
"step": 11400
},
{
"combined_loss": 2.141500949859619,
"distill_loss": 1.467064380645752,
"epoch": 0.41761301194226685,
"step": 11400,
"student_mlm_loss": 2.8159377574920654
},
{
"epoch": 0.42127628397684813,
"grad_norm": 5.802677154541016,
"learning_rate": 4.3242745165491313e-05,
"loss": 14.3215,
"step": 11500
},
{
"combined_loss": 6.576130390167236,
"distill_loss": 1.46802818775177,
"epoch": 0.42127628397684813,
"step": 11500,
"student_mlm_loss": 11.684232711791992
},
{
"epoch": 0.4249395560114294,
"grad_norm": 15.660844802856445,
"learning_rate": 4.318131557608669e-05,
"loss": 30.5877,
"step": 11600
},
{
"combined_loss": 1.9305293560028076,
"distill_loss": 1.405720591545105,
"epoch": 0.4249395560114294,
"step": 11600,
"student_mlm_loss": 2.4553380012512207
},
{
"epoch": 0.4286028280460107,
"grad_norm": 3.041947603225708,
"learning_rate": 4.311988598668207e-05,
"loss": 3.7156,
"step": 11700
},
{
"combined_loss": 2.78572940826416,
"distill_loss": 1.45219886302948,
"epoch": 0.4286028280460107,
"step": 11700,
"student_mlm_loss": 4.119259834289551
},
{
"epoch": 0.43226610008059196,
"grad_norm": 20.6744384765625,
"learning_rate": 4.305845639727744e-05,
"loss": 3.3939,
"step": 11800
},
{
"combined_loss": 2.0835349559783936,
"distill_loss": 1.4508671760559082,
"epoch": 0.43226610008059196,
"step": 11800,
"student_mlm_loss": 2.716202735900879
},
{
"epoch": 0.4359293721151733,
"grad_norm": 5.804731369018555,
"learning_rate": 4.299702680787282e-05,
"loss": 6.1951,
"step": 11900
},
{
"combined_loss": 3.1048030853271484,
"distill_loss": 1.455564260482788,
"epoch": 0.4359293721151733,
"step": 11900,
"student_mlm_loss": 4.75404167175293
},
{
"epoch": 0.4395926441497546,
"grad_norm": 33.689720153808594,
"learning_rate": 4.2935597218468196e-05,
"loss": 3.6583,
"step": 12000
},
{
"epoch": 0.4395926441497546,
"eval_loss": 3.919630527496338,
"eval_runtime": 2.0425,
"eval_samples_per_second": 3425.261,
"eval_steps_per_second": 13.709,
"step": 12000
},
{
"combined_loss": 2.315965175628662,
"distill_loss": 1.3009124994277954,
"epoch": 0.4395926441497546,
"step": 12000,
"student_mlm_loss": 3.3310179710388184
},
{
"epoch": 0.44325591618433585,
"grad_norm": 24.73545265197754,
"learning_rate": 4.2874167629063574e-05,
"loss": 2.9828,
"step": 12100
},
{
"combined_loss": 5.060952186584473,
"distill_loss": 1.3712559938430786,
"epoch": 0.44325591618433585,
"step": 12100,
"student_mlm_loss": 8.750648498535156
},
{
"epoch": 0.44691918821891713,
"grad_norm": 19.548921585083008,
"learning_rate": 4.2812738039658945e-05,
"loss": 3.1716,
"step": 12200
},
{
"combined_loss": 2.3697307109832764,
"distill_loss": 1.480096459388733,
"epoch": 0.44691918821891713,
"step": 12200,
"student_mlm_loss": 3.2593650817871094
},
{
"epoch": 0.4505824602534984,
"grad_norm": 6.217925548553467,
"learning_rate": 4.2751308450254316e-05,
"loss": 5.1037,
"step": 12300
},
{
"combined_loss": 1.9682085514068604,
"distill_loss": 1.3534774780273438,
"epoch": 0.4505824602534984,
"step": 12300,
"student_mlm_loss": 2.582939624786377
},
{
"epoch": 0.45424573228807974,
"grad_norm": 53.592735290527344,
"learning_rate": 4.2689878860849694e-05,
"loss": 5.3409,
"step": 12400
},
{
"combined_loss": 2.413550853729248,
"distill_loss": 1.3951433897018433,
"epoch": 0.45424573228807974,
"step": 12400,
"student_mlm_loss": 3.4319584369659424
},
{
"epoch": 0.457909004322661,
"grad_norm": 13.716507911682129,
"learning_rate": 4.262844927144507e-05,
"loss": 3.2261,
"step": 12500
},
{
"combined_loss": 3.6318020820617676,
"distill_loss": 1.3529082536697388,
"epoch": 0.457909004322661,
"step": 12500,
"student_mlm_loss": 5.910696029663086
},
{
"epoch": 0.4615722763572423,
"grad_norm": 16.206933975219727,
"learning_rate": 4.256701968204045e-05,
"loss": 3.1534,
"step": 12600
},
{
"combined_loss": 15.371432304382324,
"distill_loss": 1.4290032386779785,
"epoch": 0.4615722763572423,
"step": 12600,
"student_mlm_loss": 29.313861846923828
},
{
"epoch": 0.4652355483918236,
"grad_norm": 8.626960754394531,
"learning_rate": 4.250559009263582e-05,
"loss": 3.0824,
"step": 12700
},
{
"combined_loss": 2.0715112686157227,
"distill_loss": 1.3553932905197144,
"epoch": 0.4652355483918236,
"step": 12700,
"student_mlm_loss": 2.7876293659210205
},
{
"epoch": 0.46889882042640485,
"grad_norm": 8.153878211975098,
"learning_rate": 4.24441605032312e-05,
"loss": 3.8805,
"step": 12800
},
{
"combined_loss": 2.0972392559051514,
"distill_loss": 1.2276250123977661,
"epoch": 0.46889882042640485,
"step": 12800,
"student_mlm_loss": 2.966853618621826
},
{
"epoch": 0.4725620924609861,
"grad_norm": 12.068700790405273,
"learning_rate": 4.2382730913826576e-05,
"loss": 2.8937,
"step": 12900
},
{
"combined_loss": 2.9497852325439453,
"distill_loss": 1.314728021621704,
"epoch": 0.4725620924609861,
"step": 12900,
"student_mlm_loss": 4.584842681884766
},
{
"epoch": 0.47622536449556746,
"grad_norm": 12.260379791259766,
"learning_rate": 4.232130132442195e-05,
"loss": 5.581,
"step": 13000
},
{
"combined_loss": 1.8658246994018555,
"distill_loss": 1.2703187465667725,
"epoch": 0.47622536449556746,
"step": 13000,
"student_mlm_loss": 2.4613306522369385
},
{
"epoch": 0.47988863653014874,
"grad_norm": 22.688852310180664,
"learning_rate": 4.2259871735017325e-05,
"loss": 7.0059,
"step": 13100
},
{
"combined_loss": 3.673346519470215,
"distill_loss": 1.397099256515503,
"epoch": 0.47988863653014874,
"step": 13100,
"student_mlm_loss": 5.949593544006348
},
{
"epoch": 0.48355190856473,
"grad_norm": 28.811817169189453,
"learning_rate": 4.2198442145612696e-05,
"loss": 9.6395,
"step": 13200
},
{
"combined_loss": 2.036362409591675,
"distill_loss": 1.3239866495132446,
"epoch": 0.48355190856473,
"step": 13200,
"student_mlm_loss": 2.7487380504608154
},
{
"epoch": 0.4872151805993113,
"grad_norm": 6.380947589874268,
"learning_rate": 4.213701255620808e-05,
"loss": 2.7095,
"step": 13300
},
{
"combined_loss": 2.2547478675842285,
"distill_loss": 1.4122509956359863,
"epoch": 0.4872151805993113,
"step": 13300,
"student_mlm_loss": 3.09724497795105
},
{
"epoch": 0.49087845263389257,
"grad_norm": 83.60982513427734,
"learning_rate": 4.207558296680345e-05,
"loss": 3.2917,
"step": 13400
},
{
"combined_loss": 2.009040355682373,
"distill_loss": 1.4236946105957031,
"epoch": 0.49087845263389257,
"step": 13400,
"student_mlm_loss": 2.594385862350464
},
{
"epoch": 0.4945417246684739,
"grad_norm": 10.06588077545166,
"learning_rate": 4.201415337739883e-05,
"loss": 12.3205,
"step": 13500
},
{
"combined_loss": 2.9317073822021484,
"distill_loss": 1.4229042530059814,
"epoch": 0.4945417246684739,
"step": 13500,
"student_mlm_loss": 4.440510272979736
},
{
"epoch": 0.4982049967030552,
"grad_norm": 4.126479625701904,
"learning_rate": 4.19527237879942e-05,
"loss": 3.8077,
"step": 13600
},
{
"combined_loss": 1.9033926725387573,
"distill_loss": 1.357490062713623,
"epoch": 0.4982049967030552,
"step": 13600,
"student_mlm_loss": 2.4492952823638916
},
{
"epoch": 0.5018682687376365,
"grad_norm": 18.483203887939453,
"learning_rate": 4.189129419858958e-05,
"loss": 11.6361,
"step": 13700
},
{
"combined_loss": 3.165005683898926,
"distill_loss": 1.3812006711959839,
"epoch": 0.5018682687376365,
"step": 13700,
"student_mlm_loss": 4.948810577392578
},
{
"epoch": 0.5055315407722177,
"grad_norm": 7.388655662536621,
"learning_rate": 4.1829864609184956e-05,
"loss": 3.875,
"step": 13800
},
{
"combined_loss": 1.8155145645141602,
"distill_loss": 1.3641600608825684,
"epoch": 0.5055315407722177,
"step": 13800,
"student_mlm_loss": 2.266869068145752
},
{
"epoch": 0.509194812806799,
"grad_norm": 9.352982521057129,
"learning_rate": 4.176843501978033e-05,
"loss": 9.268,
"step": 13900
},
{
"combined_loss": 2.3618173599243164,
"distill_loss": 1.3162891864776611,
"epoch": 0.509194812806799,
"step": 13900,
"student_mlm_loss": 3.4073452949523926
},
{
"epoch": 0.5128580848413803,
"grad_norm": 8.513871192932129,
"learning_rate": 4.1707005430375705e-05,
"loss": 3.3999,
"step": 14000
},
{
"epoch": 0.5128580848413803,
"eval_loss": 3.5987370014190674,
"eval_runtime": 2.2869,
"eval_samples_per_second": 3059.222,
"eval_steps_per_second": 12.244,
"step": 14000
},
{
"combined_loss": 2.6841559410095215,
"distill_loss": 1.401199460029602,
"epoch": 0.5128580848413803,
"step": 14000,
"student_mlm_loss": 3.9671125411987305
},
{
"epoch": 0.5165213568759616,
"grad_norm": 30.661813735961914,
"learning_rate": 4.1645575840971076e-05,
"loss": 18.3341,
"step": 14100
},
{
"combined_loss": 4.752758026123047,
"distill_loss": 1.247560977935791,
"epoch": 0.5165213568759616,
"step": 14100,
"student_mlm_loss": 8.257954597473145
},
{
"epoch": 0.5201846289105428,
"grad_norm": 40.303707122802734,
"learning_rate": 4.158414625156646e-05,
"loss": 3.1057,
"step": 14200
},
{
"combined_loss": 1.988144874572754,
"distill_loss": 1.2577546834945679,
"epoch": 0.5201846289105428,
"step": 14200,
"student_mlm_loss": 2.7185349464416504
},
{
"epoch": 0.5238479009451242,
"grad_norm": 19.77947235107422,
"learning_rate": 4.152271666216183e-05,
"loss": 7.3457,
"step": 14300
},
{
"combined_loss": 4.299380779266357,
"distill_loss": 1.2770593166351318,
"epoch": 0.5238479009451242,
"step": 14300,
"student_mlm_loss": 7.321702480316162
},
{
"epoch": 0.5275111729797055,
"grad_norm": 7.412100315093994,
"learning_rate": 4.146128707275721e-05,
"loss": 4.8104,
"step": 14400
},
{
"combined_loss": 10.650766372680664,
"distill_loss": 1.3233892917633057,
"epoch": 0.5275111729797055,
"step": 14400,
"student_mlm_loss": 19.9781436920166
},
{
"epoch": 0.5311744450142868,
"grad_norm": 5.799710750579834,
"learning_rate": 4.139985748335258e-05,
"loss": 3.4765,
"step": 14500
},
{
"combined_loss": 2.4540774822235107,
"distill_loss": 1.319036841392517,
"epoch": 0.5311744450142868,
"step": 14500,
"student_mlm_loss": 3.589118003845215
},
{
"epoch": 0.5348377170488681,
"grad_norm": 7.147758483886719,
"learning_rate": 4.133842789394796e-05,
"loss": 3.12,
"step": 14600
},
{
"combined_loss": 1.8580541610717773,
"distill_loss": 1.3114832639694214,
"epoch": 0.5348377170488681,
"step": 14600,
"student_mlm_loss": 2.4046249389648438
},
{
"epoch": 0.5385009890834493,
"grad_norm": 5.120487213134766,
"learning_rate": 4.1276998304543336e-05,
"loss": 6.7029,
"step": 14700
},
{
"combined_loss": 1.9685258865356445,
"distill_loss": 1.2455390691757202,
"epoch": 0.5385009890834493,
"step": 14700,
"student_mlm_loss": 2.6915125846862793
},
{
"epoch": 0.5421642611180306,
"grad_norm": 6.225675106048584,
"learning_rate": 4.121556871513871e-05,
"loss": 7.1336,
"step": 14800
},
{
"combined_loss": 1.8886613845825195,
"distill_loss": 1.2913726568222046,
"epoch": 0.5421642611180306,
"step": 14800,
"student_mlm_loss": 2.485949993133545
},
{
"epoch": 0.5458275331526119,
"grad_norm": 11.508244514465332,
"learning_rate": 4.1154139125734085e-05,
"loss": 11.8719,
"step": 14900
},
{
"combined_loss": 2.1455585956573486,
"distill_loss": 1.3711117506027222,
"epoch": 0.5458275331526119,
"step": 14900,
"student_mlm_loss": 2.9200053215026855
},
{
"epoch": 0.5494908051871932,
"grad_norm": 17.030780792236328,
"learning_rate": 4.109270953632946e-05,
"loss": 3.091,
"step": 15000
},
{
"combined_loss": 1.9433504343032837,
"distill_loss": 1.538583517074585,
"epoch": 0.5494908051871932,
"step": 15000,
"student_mlm_loss": 2.3481173515319824
},
{
"epoch": 0.5531540772217745,
"grad_norm": 4.692992687225342,
"learning_rate": 4.103127994692484e-05,
"loss": 3.2488,
"step": 15100
},
{
"combined_loss": 2.820077657699585,
"distill_loss": 1.2906769514083862,
"epoch": 0.5531540772217745,
"step": 15100,
"student_mlm_loss": 4.349478244781494
},
{
"epoch": 0.5568173492563557,
"grad_norm": 49.70892333984375,
"learning_rate": 4.096985035752021e-05,
"loss": 10.6593,
"step": 15200
},
{
"combined_loss": 1.857104778289795,
"distill_loss": 1.4106833934783936,
"epoch": 0.5568173492563557,
"step": 15200,
"student_mlm_loss": 2.3035261631011963
},
{
"epoch": 0.5604806212909371,
"grad_norm": 7.913967609405518,
"learning_rate": 4.090842076811558e-05,
"loss": 3.3056,
"step": 15300
},
{
"combined_loss": 3.2144076824188232,
"distill_loss": 1.3917032480239868,
"epoch": 0.5604806212909371,
"step": 15300,
"student_mlm_loss": 5.037112236022949
},
{
"epoch": 0.5641438933255184,
"grad_norm": 10.575057983398438,
"learning_rate": 4.084699117871096e-05,
"loss": 10.0757,
"step": 15400
},
{
"combined_loss": 5.352452754974365,
"distill_loss": 1.3542910814285278,
"epoch": 0.5641438933255184,
"step": 15400,
"student_mlm_loss": 9.350614547729492
},
{
"epoch": 0.5678071653600997,
"grad_norm": 119.92784118652344,
"learning_rate": 4.078556158930634e-05,
"loss": 3.4463,
"step": 15500
},
{
"combined_loss": 1.7753610610961914,
"distill_loss": 1.3875095844268799,
"epoch": 0.5678071653600997,
"step": 15500,
"student_mlm_loss": 2.163212537765503
},
{
"epoch": 0.571470437394681,
"grad_norm": 4.203140735626221,
"learning_rate": 4.0724131999901717e-05,
"loss": 4.8205,
"step": 15600
},
{
"combined_loss": 1.8941802978515625,
"distill_loss": 1.3584777116775513,
"epoch": 0.571470437394681,
"step": 15600,
"student_mlm_loss": 2.4298830032348633
},
{
"epoch": 0.5751337094292622,
"grad_norm": 16.848825454711914,
"learning_rate": 4.066270241049709e-05,
"loss": 7.7339,
"step": 15700
},
{
"combined_loss": 1.9499808549880981,
"distill_loss": 1.3122260570526123,
"epoch": 0.5751337094292622,
"step": 15700,
"student_mlm_loss": 2.587735652923584
},
{
"epoch": 0.5787969814638435,
"grad_norm": 2.9838955402374268,
"learning_rate": 4.0601272821092465e-05,
"loss": 3.4354,
"step": 15800
},
{
"combined_loss": 1.9672229290008545,
"distill_loss": 1.3119910955429077,
"epoch": 0.5787969814638435,
"step": 15800,
"student_mlm_loss": 2.622454881668091
},
{
"epoch": 0.5824602534984248,
"grad_norm": 6.6938676834106445,
"learning_rate": 4.053984323168784e-05,
"loss": 5.2244,
"step": 15900
},
{
"combined_loss": 2.8469321727752686,
"distill_loss": 1.361178994178772,
"epoch": 0.5824602534984248,
"step": 15900,
"student_mlm_loss": 4.332685470581055
},
{
"epoch": 0.5861235255330061,
"grad_norm": 31.440717697143555,
"learning_rate": 4.047841364228322e-05,
"loss": 8.7168,
"step": 16000
},
{
"epoch": 0.5861235255330061,
"eval_loss": 3.480536937713623,
"eval_runtime": 2.1572,
"eval_samples_per_second": 3243.154,
"eval_steps_per_second": 12.98,
"step": 16000
},
{
"combined_loss": 2.0847339630126953,
"distill_loss": 1.4640412330627441,
"epoch": 0.5861235255330061,
"step": 16000,
"student_mlm_loss": 2.7054266929626465
},
{
"epoch": 0.5897867975675873,
"grad_norm": 6.238570690155029,
"learning_rate": 4.041698405287859e-05,
"loss": 3.2375,
"step": 16100
},
{
"combined_loss": 2.2635374069213867,
"distill_loss": 1.5188945531845093,
"epoch": 0.5897867975675873,
"step": 16100,
"student_mlm_loss": 3.0081801414489746
},
{
"epoch": 0.5934500696021686,
"grad_norm": 11.832098960876465,
"learning_rate": 4.035555446347396e-05,
"loss": 3.3115,
"step": 16200
},
{
"combined_loss": 2.2285714149475098,
"distill_loss": 1.4724992513656616,
"epoch": 0.5934500696021686,
"step": 16200,
"student_mlm_loss": 2.9846436977386475
},
{
"epoch": 0.5971133416367499,
"grad_norm": 8.876389503479004,
"learning_rate": 4.029412487406934e-05,
"loss": 4.1388,
"step": 16300
},
{
"combined_loss": 2.0907256603240967,
"distill_loss": 1.2955131530761719,
"epoch": 0.5971133416367499,
"step": 16300,
"student_mlm_loss": 2.8859381675720215
},
{
"epoch": 0.6007766136713313,
"grad_norm": 4.118688106536865,
"learning_rate": 4.023269528466472e-05,
"loss": 5.4036,
"step": 16400
},
{
"combined_loss": 5.190587997436523,
"distill_loss": 1.502519965171814,
"epoch": 0.6007766136713313,
"step": 16400,
"student_mlm_loss": 8.878656387329102
},
{
"epoch": 0.6044398857059126,
"grad_norm": 17.806203842163086,
"learning_rate": 4.01712656952601e-05,
"loss": 3.4529,
"step": 16500
},
{
"combined_loss": 2.0771563053131104,
"distill_loss": 1.5032036304473877,
"epoch": 0.6044398857059126,
"step": 16500,
"student_mlm_loss": 2.651108980178833
},
{
"epoch": 0.6081031577404938,
"grad_norm": 11.406692504882812,
"learning_rate": 4.010983610585547e-05,
"loss": 2.9157,
"step": 16600
},
{
"combined_loss": 2.0262105464935303,
"distill_loss": 1.406888723373413,
"epoch": 0.6081031577404938,
"step": 16600,
"student_mlm_loss": 2.6455323696136475
},
{
"epoch": 0.6117664297750751,
"grad_norm": 9.248611450195312,
"learning_rate": 4.0048406516450846e-05,
"loss": 3.7273,
"step": 16700
},
{
"combined_loss": 9.912755966186523,
"distill_loss": 1.3654385805130005,
"epoch": 0.6117664297750751,
"step": 16700,
"student_mlm_loss": 18.460073471069336
},
{
"epoch": 0.6154297018096564,
"grad_norm": 7.337488651275635,
"learning_rate": 3.9986976927046223e-05,
"loss": 3.5316,
"step": 16800
},
{
"combined_loss": 2.2111759185791016,
"distill_loss": 1.410059928894043,
"epoch": 0.6154297018096564,
"step": 16800,
"student_mlm_loss": 3.012291669845581
},
{
"epoch": 0.6190929738442377,
"grad_norm": 3.7927513122558594,
"learning_rate": 3.9925547337641595e-05,
"loss": 2.942,
"step": 16900
},
{
"combined_loss": 1.9941096305847168,
"distill_loss": 1.3353883028030396,
"epoch": 0.6190929738442377,
"step": 16900,
"student_mlm_loss": 2.6528310775756836
},
{
"epoch": 0.622756245878819,
"grad_norm": 8.092863082885742,
"learning_rate": 3.986411774823697e-05,
"loss": 8.3194,
"step": 17000
},
{
"combined_loss": 1.8197941780090332,
"distill_loss": 1.2830308675765991,
"epoch": 0.622756245878819,
"step": 17000,
"student_mlm_loss": 2.356557607650757
},
{
"epoch": 0.6264195179134002,
"grad_norm": 21.95607566833496,
"learning_rate": 3.9802688158832343e-05,
"loss": 3.6842,
"step": 17100
},
{
"combined_loss": 1.967858076095581,
"distill_loss": 1.3744505643844604,
"epoch": 0.6264195179134002,
"step": 17100,
"student_mlm_loss": 2.561265707015991
},
{
"epoch": 0.6300827899479815,
"grad_norm": 17.734630584716797,
"learning_rate": 3.974125856942773e-05,
"loss": 3.4446,
"step": 17200
},
{
"combined_loss": 3.56831955909729,
"distill_loss": 1.4127169847488403,
"epoch": 0.6300827899479815,
"step": 17200,
"student_mlm_loss": 5.723922252655029
},
{
"epoch": 0.6337460619825628,
"grad_norm": 14.227143287658691,
"learning_rate": 3.96798289800231e-05,
"loss": 4.3058,
"step": 17300
},
{
"combined_loss": 6.485238552093506,
"distill_loss": 1.3285768032073975,
"epoch": 0.6337460619825628,
"step": 17300,
"student_mlm_loss": 11.641900062561035
},
{
"epoch": 0.6374093340171441,
"grad_norm": 27.379819869995117,
"learning_rate": 3.961839939061848e-05,
"loss": 3.3666,
"step": 17400
},
{
"combined_loss": 3.212083339691162,
"distill_loss": 1.3358004093170166,
"epoch": 0.6374093340171441,
"step": 17400,
"student_mlm_loss": 5.088366508483887
},
{
"epoch": 0.6410726060517254,
"grad_norm": 6.261890411376953,
"learning_rate": 3.955696980121385e-05,
"loss": 6.3216,
"step": 17500
},
{
"combined_loss": 1.8787257671356201,
"distill_loss": 1.3068917989730835,
"epoch": 0.6410726060517254,
"step": 17500,
"student_mlm_loss": 2.4505598545074463
},
{
"epoch": 0.6447358780863067,
"grad_norm": 4.643723011016846,
"learning_rate": 3.9495540211809226e-05,
"loss": 6.3659,
"step": 17600
},
{
"combined_loss": 1.9111711978912354,
"distill_loss": 1.315952181816101,
"epoch": 0.6447358780863067,
"step": 17600,
"student_mlm_loss": 2.506390333175659
},
{
"epoch": 0.648399150120888,
"grad_norm": 209.94358825683594,
"learning_rate": 3.9434110622404604e-05,
"loss": 3.1778,
"step": 17700
},
{
"combined_loss": 2.7990779876708984,
"distill_loss": 1.360758662223816,
"epoch": 0.648399150120888,
"step": 17700,
"student_mlm_loss": 4.237397193908691
},
{
"epoch": 0.6520624221554693,
"grad_norm": 25.861230850219727,
"learning_rate": 3.9372681032999975e-05,
"loss": 6.5636,
"step": 17800
},
{
"combined_loss": 3.8194119930267334,
"distill_loss": 1.45068359375,
"epoch": 0.6520624221554693,
"step": 17800,
"student_mlm_loss": 6.188140392303467
},
{
"epoch": 0.6557256941900506,
"grad_norm": 46.81015396118164,
"learning_rate": 3.931125144359535e-05,
"loss": 6.4281,
"step": 17900
},
{
"combined_loss": 1.8790740966796875,
"distill_loss": 1.2603598833084106,
"epoch": 0.6557256941900506,
"step": 17900,
"student_mlm_loss": 2.497788429260254
},
{
"epoch": 0.6593889662246318,
"grad_norm": 3.634798049926758,
"learning_rate": 3.924982185419073e-05,
"loss": 3.7705,
"step": 18000
},
{
"epoch": 0.6593889662246318,
"eval_loss": 3.4686477184295654,
"eval_runtime": 2.0476,
"eval_samples_per_second": 3416.619,
"eval_steps_per_second": 13.674,
"step": 18000
},
{
"combined_loss": 1.8001245260238647,
"distill_loss": 1.358407735824585,
"epoch": 0.6593889662246318,
"step": 18000,
"student_mlm_loss": 2.2418413162231445
},
{
"epoch": 0.6630522382592131,
"grad_norm": 14.09543514251709,
"learning_rate": 3.918839226478611e-05,
"loss": 7.2198,
"step": 18100
},
{
"combined_loss": 2.165346622467041,
"distill_loss": 1.3290469646453857,
"epoch": 0.6630522382592131,
"step": 18100,
"student_mlm_loss": 3.0016462802886963
},
{
"epoch": 0.6667155102937944,
"grad_norm": 4.29142951965332,
"learning_rate": 3.912696267538148e-05,
"loss": 4.3053,
"step": 18200
},
{
"combined_loss": 1.8569279909133911,
"distill_loss": 1.355130910873413,
"epoch": 0.6667155102937944,
"step": 18200,
"student_mlm_loss": 2.358725070953369
},
{
"epoch": 0.6703787823283757,
"grad_norm": 4.424899101257324,
"learning_rate": 3.906553308597686e-05,
"loss": 3.2385,
"step": 18300
},
{
"combined_loss": 2.083707094192505,
"distill_loss": 1.307104229927063,
"epoch": 0.6703787823283757,
"step": 18300,
"student_mlm_loss": 2.8603098392486572
},
{
"epoch": 0.6740420543629569,
"grad_norm": 8.061409950256348,
"learning_rate": 3.900410349657223e-05,
"loss": 2.9075,
"step": 18400
},
{
"combined_loss": 1.9213597774505615,
"distill_loss": 1.434320330619812,
"epoch": 0.6740420543629569,
"step": 18400,
"student_mlm_loss": 2.4083993434906006
},
{
"epoch": 0.6777053263975383,
"grad_norm": 55.50898361206055,
"learning_rate": 3.8942673907167606e-05,
"loss": 13.4077,
"step": 18500
},
{
"combined_loss": 2.01340389251709,
"distill_loss": 1.3991159200668335,
"epoch": 0.6777053263975383,
"step": 18500,
"student_mlm_loss": 2.6276917457580566
},
{
"epoch": 0.6813685984321196,
"grad_norm": 5.348477840423584,
"learning_rate": 3.8881244317762984e-05,
"loss": 6.8559,
"step": 18600
},
{
"combined_loss": 2.5955307483673096,
"distill_loss": 1.4375801086425781,
"epoch": 0.6813685984321196,
"step": 18600,
"student_mlm_loss": 3.753481388092041
},
{
"epoch": 0.6850318704667009,
"grad_norm": 26.911954879760742,
"learning_rate": 3.8819814728358355e-05,
"loss": 9.8471,
"step": 18700
},
{
"combined_loss": 2.3086562156677246,
"distill_loss": 1.4082762002944946,
"epoch": 0.6850318704667009,
"step": 18700,
"student_mlm_loss": 3.209036350250244
},
{
"epoch": 0.6886951425012822,
"grad_norm": 8.086039543151855,
"learning_rate": 3.875838513895373e-05,
"loss": 3.841,
"step": 18800
},
{
"combined_loss": 4.487699031829834,
"distill_loss": 1.4052667617797852,
"epoch": 0.6886951425012822,
"step": 18800,
"student_mlm_loss": 7.570131301879883
},
{
"epoch": 0.6923584145358634,
"grad_norm": 10.749812126159668,
"learning_rate": 3.869695554954911e-05,
"loss": 9.7279,
"step": 18900
},
{
"combined_loss": 3.3014779090881348,
"distill_loss": 1.246164083480835,
"epoch": 0.6923584145358634,
"step": 18900,
"student_mlm_loss": 5.3567914962768555
},
{
"epoch": 0.6960216865704447,
"grad_norm": 11.313789367675781,
"learning_rate": 3.863552596014449e-05,
"loss": 28.0849,
"step": 19000
},
{
"combined_loss": 4.825923919677734,
"distill_loss": 1.377113938331604,
"epoch": 0.6960216865704447,
"step": 19000,
"student_mlm_loss": 8.274733543395996
},
{
"epoch": 0.699684958605026,
"grad_norm": 3.8648459911346436,
"learning_rate": 3.857409637073986e-05,
"loss": 5.8981,
"step": 19100
},
{
"combined_loss": 3.4921586513519287,
"distill_loss": 1.4171725511550903,
"epoch": 0.699684958605026,
"step": 19100,
"student_mlm_loss": 5.567144870758057
},
{
"epoch": 0.7033482306396073,
"grad_norm": 18.98455238342285,
"learning_rate": 3.851266678133523e-05,
"loss": 2.5944,
"step": 19200
},
{
"combined_loss": 1.8949182033538818,
"distill_loss": 1.3743678331375122,
"epoch": 0.7033482306396073,
"step": 19200,
"student_mlm_loss": 2.415468692779541
},
{
"epoch": 0.7070115026741886,
"grad_norm": 27.53456687927246,
"learning_rate": 3.845123719193061e-05,
"loss": 2.8462,
"step": 19300
},
{
"combined_loss": 1.8077284097671509,
"distill_loss": 1.2764451503753662,
"epoch": 0.7070115026741886,
"step": 19300,
"student_mlm_loss": 2.3390116691589355
},
{
"epoch": 0.7106747747087698,
"grad_norm": 8.815896987915039,
"learning_rate": 3.8389807602525986e-05,
"loss": 3.403,
"step": 19400
},
{
"combined_loss": 2.2496674060821533,
"distill_loss": 1.408218264579773,
"epoch": 0.7106747747087698,
"step": 19400,
"student_mlm_loss": 3.091116428375244
},
{
"epoch": 0.7143380467433511,
"grad_norm": 20.02590560913086,
"learning_rate": 3.8328378013121364e-05,
"loss": 3.7767,
"step": 19500
},
{
"combined_loss": 2.6540353298187256,
"distill_loss": 1.451707124710083,
"epoch": 0.7143380467433511,
"step": 19500,
"student_mlm_loss": 3.856363534927368
},
{
"epoch": 0.7180013187779325,
"grad_norm": 48.139583587646484,
"learning_rate": 3.8266948423716735e-05,
"loss": 3.4148,
"step": 19600
},
{
"combined_loss": 3.5710411071777344,
"distill_loss": 1.2874888181686401,
"epoch": 0.7180013187779325,
"step": 19600,
"student_mlm_loss": 5.854593276977539
},
{
"epoch": 0.7216645908125138,
"grad_norm": 5.810763835906982,
"learning_rate": 3.820551883431211e-05,
"loss": 11.1815,
"step": 19700
},
{
"combined_loss": 2.022658586502075,
"distill_loss": 1.408826231956482,
"epoch": 0.7216645908125138,
"step": 19700,
"student_mlm_loss": 2.636491060256958
},
{
"epoch": 0.725327862847095,
"grad_norm": 5.03505277633667,
"learning_rate": 3.814408924490749e-05,
"loss": 3.5792,
"step": 19800
},
{
"combined_loss": 2.450950860977173,
"distill_loss": 1.3786026239395142,
"epoch": 0.725327862847095,
"step": 19800,
"student_mlm_loss": 3.523299217224121
},
{
"epoch": 0.7289911348816763,
"grad_norm": 44.703548431396484,
"learning_rate": 3.808265965550287e-05,
"loss": 14.0822,
"step": 19900
},
{
"combined_loss": 1.8448269367218018,
"distill_loss": 1.3061137199401855,
"epoch": 0.7289911348816763,
"step": 19900,
"student_mlm_loss": 2.383540153503418
},
{
"epoch": 0.7326544069162576,
"grad_norm": 73.46593475341797,
"learning_rate": 3.802123006609824e-05,
"loss": 3.5648,
"step": 20000
},
{
"epoch": 0.7326544069162576,
"eval_loss": 3.689605474472046,
"eval_runtime": 2.2951,
"eval_samples_per_second": 3048.261,
"eval_steps_per_second": 12.2,
"step": 20000
},
{
"combined_loss": 5.831945896148682,
"distill_loss": 1.2505719661712646,
"epoch": 0.7326544069162576,
"step": 20000,
"student_mlm_loss": 10.41331958770752
},
{
"epoch": 0.7363176789508389,
"grad_norm": 7.289074897766113,
"learning_rate": 3.795980047669361e-05,
"loss": 5.9452,
"step": 20100
},
{
"combined_loss": 14.608942985534668,
"distill_loss": 1.4141182899475098,
"epoch": 0.7363176789508389,
"step": 20100,
"student_mlm_loss": 27.803768157958984
},
{
"epoch": 0.7399809509854202,
"grad_norm": 15.717759132385254,
"learning_rate": 3.7898370887288995e-05,
"loss": 5.3196,
"step": 20200
},
{
"combined_loss": 2.34932279586792,
"distill_loss": 1.2641239166259766,
"epoch": 0.7399809509854202,
"step": 20200,
"student_mlm_loss": 3.434521436691284
},
{
"epoch": 0.7436442230200014,
"grad_norm": 75.113037109375,
"learning_rate": 3.7836941297884366e-05,
"loss": 3.4868,
"step": 20300
},
{
"combined_loss": 2.0885400772094727,
"distill_loss": 1.3560060262680054,
"epoch": 0.7436442230200014,
"step": 20300,
"student_mlm_loss": 2.8210740089416504
},
{
"epoch": 0.7473074950545827,
"grad_norm": 12.071985244750977,
"learning_rate": 3.7775511708479744e-05,
"loss": 3.1594,
"step": 20400
},
{
"combined_loss": 2.104968309402466,
"distill_loss": 1.456742286682129,
"epoch": 0.7473074950545827,
"step": 20400,
"student_mlm_loss": 2.7531943321228027
},
{
"epoch": 0.750970767089164,
"grad_norm": 49.17687225341797,
"learning_rate": 3.7714082119075115e-05,
"loss": 5.0772,
"step": 20500
},
{
"combined_loss": 1.9532296657562256,
"distill_loss": 1.2734321355819702,
"epoch": 0.750970767089164,
"step": 20500,
"student_mlm_loss": 2.6330270767211914
},
{
"epoch": 0.7546340391237454,
"grad_norm": 4.601011753082275,
"learning_rate": 3.765265252967049e-05,
"loss": 8.0874,
"step": 20600
},
{
"combined_loss": 1.8828588724136353,
"distill_loss": 1.35260009765625,
"epoch": 0.7546340391237454,
"step": 20600,
"student_mlm_loss": 2.4131176471710205
},
{
"epoch": 0.7582973111583267,
"grad_norm": 3.9183883666992188,
"learning_rate": 3.759122294026587e-05,
"loss": 3.1836,
"step": 20700
},
{
"combined_loss": 3.261841058731079,
"distill_loss": 1.35749351978302,
"epoch": 0.7582973111583267,
"step": 20700,
"student_mlm_loss": 5.166188716888428
},
{
"epoch": 0.7619605831929079,
"grad_norm": 59.35635757446289,
"learning_rate": 3.752979335086124e-05,
"loss": 3.446,
"step": 20800
},
{
"combined_loss": 2.0783181190490723,
"distill_loss": 1.3386023044586182,
"epoch": 0.7619605831929079,
"step": 20800,
"student_mlm_loss": 2.8180341720581055
},
{
"epoch": 0.7656238552274892,
"grad_norm": 14.875,
"learning_rate": 3.746836376145662e-05,
"loss": 8.5798,
"step": 20900
},
{
"combined_loss": 1.926416039466858,
"distill_loss": 1.3077542781829834,
"epoch": 0.7656238552274892,
"step": 20900,
"student_mlm_loss": 2.5450778007507324
},
{
"epoch": 0.7692871272620705,
"grad_norm": 23.419870376586914,
"learning_rate": 3.740693417205199e-05,
"loss": 5.2177,
"step": 21000
},
{
"combined_loss": 1.7290170192718506,
"distill_loss": 1.2258715629577637,
"epoch": 0.7692871272620705,
"step": 21000,
"student_mlm_loss": 2.2321624755859375
},
{
"epoch": 0.7729503992966518,
"grad_norm": 29.292964935302734,
"learning_rate": 3.7345504582647375e-05,
"loss": 13.8021,
"step": 21100
},
{
"combined_loss": 1.9402461051940918,
"distill_loss": 1.2749103307724,
"epoch": 0.7729503992966518,
"step": 21100,
"student_mlm_loss": 2.6055819988250732
},
{
"epoch": 0.776613671331233,
"grad_norm": 9.03995418548584,
"learning_rate": 3.7284074993242747e-05,
"loss": 6.547,
"step": 21200
},
{
"combined_loss": 2.2710204124450684,
"distill_loss": 1.312924861907959,
"epoch": 0.776613671331233,
"step": 21200,
"student_mlm_loss": 3.229116201400757
},
{
"epoch": 0.7802769433658143,
"grad_norm": 11.86938190460205,
"learning_rate": 3.7222645403838124e-05,
"loss": 12.9682,
"step": 21300
},
{
"combined_loss": 3.114459991455078,
"distill_loss": 1.318755865097046,
"epoch": 0.7802769433658143,
"step": 21300,
"student_mlm_loss": 4.910163879394531
},
{
"epoch": 0.7839402154003956,
"grad_norm": 14.11950969696045,
"learning_rate": 3.7161215814433495e-05,
"loss": 3.1257,
"step": 21400
},
{
"combined_loss": 3.882293224334717,
"distill_loss": 1.1930829286575317,
"epoch": 0.7839402154003956,
"step": 21400,
"student_mlm_loss": 6.571503639221191
},
{
"epoch": 0.7876034874349769,
"grad_norm": 22.7275447845459,
"learning_rate": 3.709978622502887e-05,
"loss": 3.1395,
"step": 21500
},
{
"combined_loss": 2.00057315826416,
"distill_loss": 1.3134089708328247,
"epoch": 0.7876034874349769,
"step": 21500,
"student_mlm_loss": 2.687737226486206
},
{
"epoch": 0.7912667594695582,
"grad_norm": 56.84143829345703,
"learning_rate": 3.703835663562425e-05,
"loss": 13.1799,
"step": 21600
},
{
"combined_loss": 2.094574213027954,
"distill_loss": 1.3792191743850708,
"epoch": 0.7912667594695582,
"step": 21600,
"student_mlm_loss": 2.809929370880127
},
{
"epoch": 0.7949300315041395,
"grad_norm": 30.655105590820312,
"learning_rate": 3.697692704621962e-05,
"loss": 4.1563,
"step": 21700
},
{
"combined_loss": 2.167109489440918,
"distill_loss": 1.3041900396347046,
"epoch": 0.7949300315041395,
"step": 21700,
"student_mlm_loss": 3.030029058456421
},
{
"epoch": 0.7985933035387208,
"grad_norm": 7.400668144226074,
"learning_rate": 3.6915497456815e-05,
"loss": 9.7848,
"step": 21800
},
{
"combined_loss": 2.2639806270599365,
"distill_loss": 1.3241550922393799,
"epoch": 0.7985933035387208,
"step": 21800,
"student_mlm_loss": 3.203806161880493
},
{
"epoch": 0.8022565755733021,
"grad_norm": 28.212512969970703,
"learning_rate": 3.685406786741038e-05,
"loss": 2.7595,
"step": 21900
},
{
"combined_loss": 1.9249264001846313,
"distill_loss": 1.337939739227295,
"epoch": 0.8022565755733021,
"step": 21900,
"student_mlm_loss": 2.5119130611419678
},
{
"epoch": 0.8059198476078834,
"grad_norm": 5.998919486999512,
"learning_rate": 3.6792638278005756e-05,
"loss": 5.9041,
"step": 22000
},
{
"epoch": 0.8059198476078834,
"eval_loss": 3.310230016708374,
"eval_runtime": 1.9252,
"eval_samples_per_second": 3633.98,
"eval_steps_per_second": 14.544,
"step": 22000
},
{
"combined_loss": 2.208944320678711,
"distill_loss": 1.2883169651031494,
"epoch": 0.8059198476078834,
"step": 22000,
"student_mlm_loss": 3.1295716762542725
},
{
"epoch": 0.8095831196424647,
"grad_norm": 42.16996383666992,
"learning_rate": 3.673120868860113e-05,
"loss": 10.4166,
"step": 22100
},
{
"combined_loss": 2.089421510696411,
"distill_loss": 1.3541114330291748,
"epoch": 0.8095831196424647,
"step": 22100,
"student_mlm_loss": 2.8247315883636475
},
{
"epoch": 0.8132463916770459,
"grad_norm": 10.702394485473633,
"learning_rate": 3.6669779099196505e-05,
"loss": 3.5812,
"step": 22200
},
{
"combined_loss": 1.8974239826202393,
"distill_loss": 1.3954590559005737,
"epoch": 0.8132463916770459,
"step": 22200,
"student_mlm_loss": 2.3993890285491943
},
{
"epoch": 0.8169096637116272,
"grad_norm": 149.82179260253906,
"learning_rate": 3.6608349509791876e-05,
"loss": 3.229,
"step": 22300
},
{
"combined_loss": 2.0663747787475586,
"distill_loss": 1.3880882263183594,
"epoch": 0.8169096637116272,
"step": 22300,
"student_mlm_loss": 2.7446610927581787
},
{
"epoch": 0.8205729357462085,
"grad_norm": 5.735169410705566,
"learning_rate": 3.6546919920387253e-05,
"loss": 13.0135,
"step": 22400
},
{
"combined_loss": 2.3801686763763428,
"distill_loss": 1.2296876907348633,
"epoch": 0.8205729357462085,
"step": 22400,
"student_mlm_loss": 3.5306496620178223
},
{
"epoch": 0.8242362077807898,
"grad_norm": 3.9154951572418213,
"learning_rate": 3.648549033098263e-05,
"loss": 3.0256,
"step": 22500
},
{
"combined_loss": 2.619138240814209,
"distill_loss": 1.369718313217163,
"epoch": 0.8242362077807898,
"step": 22500,
"student_mlm_loss": 3.868557929992676
},
{
"epoch": 0.827899479815371,
"grad_norm": 6.706686019897461,
"learning_rate": 3.6424060741578e-05,
"loss": 6.8373,
"step": 22600
},
{
"combined_loss": 3.571559429168701,
"distill_loss": 1.360285758972168,
"epoch": 0.827899479815371,
"step": 22600,
"student_mlm_loss": 5.782833099365234
},
{
"epoch": 0.8315627518499524,
"grad_norm": 63.70609664916992,
"learning_rate": 3.636263115217338e-05,
"loss": 3.1874,
"step": 22700
},
{
"combined_loss": 6.645792007446289,
"distill_loss": 1.3381716012954712,
"epoch": 0.8315627518499524,
"step": 22700,
"student_mlm_loss": 11.953412055969238
},
{
"epoch": 0.8352260238845337,
"grad_norm": 112.02607727050781,
"learning_rate": 3.630120156276876e-05,
"loss": 4.1698,
"step": 22800
},
{
"combined_loss": 2.399282455444336,
"distill_loss": 1.2190183401107788,
"epoch": 0.8352260238845337,
"step": 22800,
"student_mlm_loss": 3.5795464515686035
},
{
"epoch": 0.838889295919115,
"grad_norm": 319.05230712890625,
"learning_rate": 3.6239771973364136e-05,
"loss": 3.351,
"step": 22900
},
{
"combined_loss": 5.626018047332764,
"distill_loss": 1.3532286882400513,
"epoch": 0.838889295919115,
"step": 22900,
"student_mlm_loss": 9.898807525634766
},
{
"epoch": 0.8425525679536963,
"grad_norm": 4.46912956237793,
"learning_rate": 3.617834238395951e-05,
"loss": 3.1926,
"step": 23000
},
{
"combined_loss": 1.8462562561035156,
"distill_loss": 1.339337944984436,
"epoch": 0.8425525679536963,
"step": 23000,
"student_mlm_loss": 2.3531746864318848
},
{
"epoch": 0.8462158399882775,
"grad_norm": 15.756026268005371,
"learning_rate": 3.611691279455488e-05,
"loss": 11.7086,
"step": 23100
},
{
"combined_loss": 3.4101529121398926,
"distill_loss": 1.3407546281814575,
"epoch": 0.8462158399882775,
"step": 23100,
"student_mlm_loss": 5.479551315307617
},
{
"epoch": 0.8498791120228588,
"grad_norm": 12.350069046020508,
"learning_rate": 3.6055483205150256e-05,
"loss": 3.1203,
"step": 23200
},
{
"combined_loss": 2.5675039291381836,
"distill_loss": 1.2296205759048462,
"epoch": 0.8498791120228588,
"step": 23200,
"student_mlm_loss": 3.9053874015808105
},
{
"epoch": 0.8535423840574401,
"grad_norm": 11.17212963104248,
"learning_rate": 3.5994053615745634e-05,
"loss": 6.2935,
"step": 23300
},
{
"combined_loss": 2.901674270629883,
"distill_loss": 1.318871021270752,
"epoch": 0.8535423840574401,
"step": 23300,
"student_mlm_loss": 4.484477519989014
},
{
"epoch": 0.8572056560920214,
"grad_norm": 11.69430160522461,
"learning_rate": 3.593262402634101e-05,
"loss": 6.1123,
"step": 23400
},
{
"combined_loss": 1.962475061416626,
"distill_loss": 1.3837331533432007,
"epoch": 0.8572056560920214,
"step": 23400,
"student_mlm_loss": 2.541217088699341
},
{
"epoch": 0.8608689281266027,
"grad_norm": 6.221428394317627,
"learning_rate": 3.587119443693638e-05,
"loss": 5.0621,
"step": 23500
},
{
"combined_loss": 2.3063066005706787,
"distill_loss": 1.364685297012329,
"epoch": 0.8608689281266027,
"step": 23500,
"student_mlm_loss": 3.2479279041290283
},
{
"epoch": 0.8645322001611839,
"grad_norm": 3.200302839279175,
"learning_rate": 3.580976484753176e-05,
"loss": 3.1679,
"step": 23600
},
{
"combined_loss": 14.653901100158691,
"distill_loss": 1.3521461486816406,
"epoch": 0.8645322001611839,
"step": 23600,
"student_mlm_loss": 27.955656051635742
},
{
"epoch": 0.8681954721957652,
"grad_norm": 18.003841400146484,
"learning_rate": 3.574833525812714e-05,
"loss": 4.2524,
"step": 23700
},
{
"combined_loss": 2.05013108253479,
"distill_loss": 1.473749041557312,
"epoch": 0.8681954721957652,
"step": 23700,
"student_mlm_loss": 2.6265130043029785
},
{
"epoch": 0.8718587442303466,
"grad_norm": 16.64165687561035,
"learning_rate": 3.5686905668722516e-05,
"loss": 3.4139,
"step": 23800
},
{
"combined_loss": 3.8039913177490234,
"distill_loss": 1.3022387027740479,
"epoch": 0.8718587442303466,
"step": 23800,
"student_mlm_loss": 6.305744171142578
},
{
"epoch": 0.8755220162649279,
"grad_norm": 6.90595817565918,
"learning_rate": 3.562547607931789e-05,
"loss": 5.4512,
"step": 23900
},
{
"combined_loss": 2.0175633430480957,
"distill_loss": 1.2362921237945557,
"epoch": 0.8755220162649279,
"step": 23900,
"student_mlm_loss": 2.7988343238830566
},
{
"epoch": 0.8791852882995091,
"grad_norm": 26.792980194091797,
"learning_rate": 3.556404648991326e-05,
"loss": 6.622,
"step": 24000
},
{
"epoch": 0.8791852882995091,
"eval_loss": 3.643918991088867,
"eval_runtime": 1.9198,
"eval_samples_per_second": 3644.043,
"eval_steps_per_second": 14.585,
"step": 24000
},
{
"combined_loss": 2.1716020107269287,
"distill_loss": 1.3234556913375854,
"epoch": 0.8791852882995091,
"step": 24000,
"student_mlm_loss": 3.0197484493255615
},
{
"epoch": 0.8828485603340904,
"grad_norm": 4.8087568283081055,
"learning_rate": 3.550261690050864e-05,
"loss": 4.0542,
"step": 24100
},
{
"combined_loss": 13.035262107849121,
"distill_loss": 1.353433609008789,
"epoch": 0.8828485603340904,
"step": 24100,
"student_mlm_loss": 24.717090606689453
},
{
"epoch": 0.8865118323686717,
"grad_norm": 10.60560417175293,
"learning_rate": 3.5441187311104014e-05,
"loss": 3.1068,
"step": 24200
},
{
"combined_loss": 1.8867456912994385,
"distill_loss": 1.2289210557937622,
"epoch": 0.8865118323686717,
"step": 24200,
"student_mlm_loss": 2.544570207595825
},
{
"epoch": 0.890175104403253,
"grad_norm": 11.34473705291748,
"learning_rate": 3.537975772169939e-05,
"loss": 2.9801,
"step": 24300
},
{
"combined_loss": 1.7472858428955078,
"distill_loss": 1.229453206062317,
"epoch": 0.890175104403253,
"step": 24300,
"student_mlm_loss": 2.265118360519409
},
{
"epoch": 0.8938383764378343,
"grad_norm": 17.742507934570312,
"learning_rate": 3.531832813229476e-05,
"loss": 4.6617,
"step": 24400
},
{
"combined_loss": 1.9173786640167236,
"distill_loss": 1.3212807178497314,
"epoch": 0.8938383764378343,
"step": 24400,
"student_mlm_loss": 2.513476610183716
},
{
"epoch": 0.8975016484724155,
"grad_norm": 14.223791122436523,
"learning_rate": 3.525689854289014e-05,
"loss": 3.0537,
"step": 24500
},
{
"combined_loss": 1.7878549098968506,
"distill_loss": 1.2908958196640015,
"epoch": 0.8975016484724155,
"step": 24500,
"student_mlm_loss": 2.28481388092041
},
{
"epoch": 0.9011649205069968,
"grad_norm": 4.241771697998047,
"learning_rate": 3.519546895348552e-05,
"loss": 7.9255,
"step": 24600
},
{
"combined_loss": 1.8853719234466553,
"distill_loss": 1.3350555896759033,
"epoch": 0.9011649205069968,
"step": 24600,
"student_mlm_loss": 2.4356882572174072
},
{
"epoch": 0.9048281925415781,
"grad_norm": 5.793640613555908,
"learning_rate": 3.513403936408089e-05,
"loss": 2.9971,
"step": 24700
},
{
"combined_loss": 9.072087287902832,
"distill_loss": 1.2805593013763428,
"epoch": 0.9048281925415781,
"step": 24700,
"student_mlm_loss": 16.863615036010742
},
{
"epoch": 0.9084914645761595,
"grad_norm": 4.500351905822754,
"learning_rate": 3.507260977467627e-05,
"loss": 2.9841,
"step": 24800
},
{
"combined_loss": 4.229645252227783,
"distill_loss": 1.231893539428711,
"epoch": 0.9084914645761595,
"step": 24800,
"student_mlm_loss": 7.2273969650268555
},
{
"epoch": 0.9121547366107408,
"grad_norm": 24.93678855895996,
"learning_rate": 3.501118018527164e-05,
"loss": 5.2865,
"step": 24900
},
{
"combined_loss": 4.519498825073242,
"distill_loss": 1.35053288936615,
"epoch": 0.9121547366107408,
"step": 24900,
"student_mlm_loss": 7.688465118408203
},
{
"epoch": 0.915818008645322,
"grad_norm": 9.416017532348633,
"learning_rate": 3.494975059586702e-05,
"loss": 2.9688,
"step": 25000
},
{
"combined_loss": 4.33969783782959,
"distill_loss": 1.2811079025268555,
"epoch": 0.915818008645322,
"step": 25000,
"student_mlm_loss": 7.398288249969482
},
{
"epoch": 0.9194812806799033,
"grad_norm": 41.79585266113281,
"learning_rate": 3.4888321006462394e-05,
"loss": 12.352,
"step": 25100
},
{
"combined_loss": 2.398942232131958,
"distill_loss": 1.3129199743270874,
"epoch": 0.9194812806799033,
"step": 25100,
"student_mlm_loss": 3.484964609146118
},
{
"epoch": 0.9231445527144846,
"grad_norm": 27.67843246459961,
"learning_rate": 3.482689141705777e-05,
"loss": 4.6291,
"step": 25200
},
{
"combined_loss": 1.8275630474090576,
"distill_loss": 1.1290583610534668,
"epoch": 0.9231445527144846,
"step": 25200,
"student_mlm_loss": 2.5260677337646484
},
{
"epoch": 0.9268078247490659,
"grad_norm": 57.03019332885742,
"learning_rate": 3.476546182765314e-05,
"loss": 3.8226,
"step": 25300
},
{
"combined_loss": 1.8621808290481567,
"distill_loss": 1.3249785900115967,
"epoch": 0.9268078247490659,
"step": 25300,
"student_mlm_loss": 2.399383068084717
},
{
"epoch": 0.9304710967836471,
"grad_norm": 5.4275007247924805,
"learning_rate": 3.470403223824852e-05,
"loss": 3.7803,
"step": 25400
},
{
"combined_loss": 5.317490100860596,
"distill_loss": 1.3810964822769165,
"epoch": 0.9304710967836471,
"step": 25400,
"student_mlm_loss": 9.253883361816406
},
{
"epoch": 0.9341343688182284,
"grad_norm": 6.36318302154541,
"learning_rate": 3.46426026488439e-05,
"loss": 17.9114,
"step": 25500
},
{
"combined_loss": 4.816742897033691,
"distill_loss": 1.274537444114685,
"epoch": 0.9341343688182284,
"step": 25500,
"student_mlm_loss": 8.358948707580566
},
{
"epoch": 0.9377976408528097,
"grad_norm": 4.670822620391846,
"learning_rate": 3.458117305943927e-05,
"loss": 3.4352,
"step": 25600
},
{
"combined_loss": 1.7166364192962646,
"distill_loss": 1.2876447439193726,
"epoch": 0.9377976408528097,
"step": 25600,
"student_mlm_loss": 2.145627975463867
},
{
"epoch": 0.941460912887391,
"grad_norm": 16.301795959472656,
"learning_rate": 3.451974347003465e-05,
"loss": 2.591,
"step": 25700
},
{
"combined_loss": 1.8349076509475708,
"distill_loss": 1.3192713260650635,
"epoch": 0.941460912887391,
"step": 25700,
"student_mlm_loss": 2.350543975830078
},
{
"epoch": 0.9451241849219723,
"grad_norm": 4.464934349060059,
"learning_rate": 3.4458313880630025e-05,
"loss": 5.3202,
"step": 25800
},
{
"combined_loss": 2.022656202316284,
"distill_loss": 1.4582451581954956,
"epoch": 0.9451241849219723,
"step": 25800,
"student_mlm_loss": 2.587067127227783
},
{
"epoch": 0.9487874569565536,
"grad_norm": 13.280508041381836,
"learning_rate": 3.43968842912254e-05,
"loss": 3.2685,
"step": 25900
},
{
"combined_loss": 1.7409727573394775,
"distill_loss": 1.2449432611465454,
"epoch": 0.9487874569565536,
"step": 25900,
"student_mlm_loss": 2.23700213432312
},
{
"epoch": 0.9524507289911349,
"grad_norm": 34.54155349731445,
"learning_rate": 3.4335454701820774e-05,
"loss": 4.4614,
"step": 26000
},
{
"epoch": 0.9524507289911349,
"eval_loss": 3.371135950088501,
"eval_runtime": 1.9026,
"eval_samples_per_second": 3677.064,
"eval_steps_per_second": 14.717,
"step": 26000
},
{
"combined_loss": 2.1200222969055176,
"distill_loss": 1.4147942066192627,
"epoch": 0.9524507289911349,
"step": 26000,
"student_mlm_loss": 2.8252503871917725
},
{
"epoch": 0.9561140010257162,
"grad_norm": 12.063314437866211,
"learning_rate": 3.427402511241615e-05,
"loss": 3.8605,
"step": 26100
},
{
"combined_loss": 2.440842866897583,
"distill_loss": 1.4115891456604004,
"epoch": 0.9561140010257162,
"step": 26100,
"student_mlm_loss": 3.4700965881347656
},
{
"epoch": 0.9597772730602975,
"grad_norm": 3.154322862625122,
"learning_rate": 3.421259552301152e-05,
"loss": 3.4216,
"step": 26200
},
{
"combined_loss": 2.0511860847473145,
"distill_loss": 1.2086646556854248,
"epoch": 0.9597772730602975,
"step": 26200,
"student_mlm_loss": 2.893707752227783
},
{
"epoch": 0.9634405450948788,
"grad_norm": 4.469895839691162,
"learning_rate": 3.41511659336069e-05,
"loss": 8.4313,
"step": 26300
},
{
"combined_loss": 1.9184556007385254,
"distill_loss": 1.311684489250183,
"epoch": 0.9634405450948788,
"step": 26300,
"student_mlm_loss": 2.525226593017578
},
{
"epoch": 0.96710381712946,
"grad_norm": 37.47445297241211,
"learning_rate": 3.408973634420228e-05,
"loss": 3.33,
"step": 26400
},
{
"combined_loss": 1.8568530082702637,
"distill_loss": 1.3435510396957397,
"epoch": 0.96710381712946,
"step": 26400,
"student_mlm_loss": 2.370154857635498
},
{
"epoch": 0.9707670891640413,
"grad_norm": 5.385250091552734,
"learning_rate": 3.402830675479765e-05,
"loss": 3.0353,
"step": 26500
},
{
"combined_loss": 2.078137159347534,
"distill_loss": 1.4688613414764404,
"epoch": 0.9707670891640413,
"step": 26500,
"student_mlm_loss": 2.687412977218628
},
{
"epoch": 0.9744303611986226,
"grad_norm": 20.363506317138672,
"learning_rate": 3.396687716539303e-05,
"loss": 5.5902,
"step": 26600
},
{
"combined_loss": 2.420652151107788,
"distill_loss": 1.3566147089004517,
"epoch": 0.9744303611986226,
"step": 26600,
"student_mlm_loss": 3.484689474105835
},
{
"epoch": 0.9780936332332039,
"grad_norm": 5.678069591522217,
"learning_rate": 3.3905447575988405e-05,
"loss": 3.1063,
"step": 26700
},
{
"combined_loss": 2.2643003463745117,
"distill_loss": 1.3446204662322998,
"epoch": 0.9780936332332039,
"step": 26700,
"student_mlm_loss": 3.1839799880981445
},
{
"epoch": 0.9817569052677851,
"grad_norm": 8.722668647766113,
"learning_rate": 3.384401798658378e-05,
"loss": 9.3685,
"step": 26800
},
{
"combined_loss": 8.34331226348877,
"distill_loss": 1.3864542245864868,
"epoch": 0.9817569052677851,
"step": 26800,
"student_mlm_loss": 15.3001708984375
},
{
"epoch": 0.9854201773023665,
"grad_norm": 5.101404190063477,
"learning_rate": 3.3782588397179154e-05,
"loss": 3.1112,
"step": 26900
},
{
"combined_loss": 30.241453170776367,
"distill_loss": 1.3818217515945435,
"epoch": 0.9854201773023665,
"step": 26900,
"student_mlm_loss": 59.1010856628418
},
{
"epoch": 0.9890834493369478,
"grad_norm": 3.8359858989715576,
"learning_rate": 3.3721158807774525e-05,
"loss": 3.348,
"step": 27000
},
{
"combined_loss": 1.8264105319976807,
"distill_loss": 1.2956147193908691,
"epoch": 0.9890834493369478,
"step": 27000,
"student_mlm_loss": 2.357206344604492
},
{
"epoch": 0.9927467213715291,
"grad_norm": 33.43736267089844,
"learning_rate": 3.36597292183699e-05,
"loss": 3.5437,
"step": 27100
},
{
"combined_loss": 2.331777572631836,
"distill_loss": 1.3274433612823486,
"epoch": 0.9927467213715291,
"step": 27100,
"student_mlm_loss": 3.3361120223999023
},
{
"epoch": 0.9964099934061104,
"grad_norm": 2.9736690521240234,
"learning_rate": 3.359829962896528e-05,
"loss": 2.828,
"step": 27200
},
{
"combined_loss": 2.0438201427459717,
"distill_loss": 1.334372639656067,
"epoch": 0.9964099934061104,
"step": 27200,
"student_mlm_loss": 2.753267526626587
},
{
"epoch": 1.0000732654406916,
"grad_norm": 3.6774871349334717,
"learning_rate": 3.353687003956066e-05,
"loss": 3.168,
"step": 27300
},
{
"combined_loss": 3.4676733016967773,
"distill_loss": 1.2681790590286255,
"epoch": 1.0000732654406916,
"step": 27300,
"student_mlm_loss": 5.667167663574219
},
{
"epoch": 1.003736537475273,
"grad_norm": 20.265796661376953,
"learning_rate": 3.347544045015603e-05,
"loss": 4.9071,
"step": 27400
},
{
"combined_loss": 1.740236520767212,
"distill_loss": 1.1595730781555176,
"epoch": 1.003736537475273,
"step": 27400,
"student_mlm_loss": 2.3208999633789062
},
{
"epoch": 1.0073998095098542,
"grad_norm": 14.427675247192383,
"learning_rate": 3.341401086075141e-05,
"loss": 3.1375,
"step": 27500
},
{
"combined_loss": 2.0229873657226562,
"distill_loss": 1.3961925506591797,
"epoch": 1.0073998095098542,
"step": 27500,
"student_mlm_loss": 2.6497819423675537
},
{
"epoch": 1.0110630815444355,
"grad_norm": 3.032438039779663,
"learning_rate": 3.3352581271346786e-05,
"loss": 2.7581,
"step": 27600
},
{
"combined_loss": 1.9314367771148682,
"distill_loss": 1.2618595361709595,
"epoch": 1.0110630815444355,
"step": 27600,
"student_mlm_loss": 2.6010141372680664
},
{
"epoch": 1.0147263535790167,
"grad_norm": 6.167496681213379,
"learning_rate": 3.3291151681942163e-05,
"loss": 6.7788,
"step": 27700
},
{
"combined_loss": 2.247697353363037,
"distill_loss": 1.4385483264923096,
"epoch": 1.0147263535790167,
"step": 27700,
"student_mlm_loss": 3.0568461418151855
},
{
"epoch": 1.018389625613598,
"grad_norm": 4.82693338394165,
"learning_rate": 3.3229722092537534e-05,
"loss": 5.9229,
"step": 27800
},
{
"combined_loss": 3.4328160285949707,
"distill_loss": 1.319059133529663,
"epoch": 1.018389625613598,
"step": 27800,
"student_mlm_loss": 5.546572685241699
},
{
"epoch": 1.0220528976481793,
"grad_norm": 13.18911361694336,
"learning_rate": 3.3168292503132906e-05,
"loss": 3.5041,
"step": 27900
},
{
"combined_loss": 3.720487594604492,
"distill_loss": 1.233067274093628,
"epoch": 1.0220528976481793,
"step": 27900,
"student_mlm_loss": 6.207907676696777
},
{
"epoch": 1.0257161696827606,
"grad_norm": 10.725250244140625,
"learning_rate": 3.310686291372829e-05,
"loss": 2.9279,
"step": 28000
},
{
"epoch": 1.0257161696827606,
"eval_loss": 3.3177244663238525,
"eval_runtime": 2.0821,
"eval_samples_per_second": 3360.034,
"eval_steps_per_second": 13.448,
"step": 28000
},
{
"combined_loss": 2.0106987953186035,
"distill_loss": 1.3163011074066162,
"epoch": 1.0257161696827606,
"step": 28000,
"student_mlm_loss": 2.70509672164917
},
{
"epoch": 1.0293794417173419,
"grad_norm": 5.406506538391113,
"learning_rate": 3.304543332432366e-05,
"loss": 3.2149,
"step": 28100
},
{
"combined_loss": 2.042628288269043,
"distill_loss": 1.3173636198043823,
"epoch": 1.0293794417173419,
"step": 28100,
"student_mlm_loss": 2.767892837524414
},
{
"epoch": 1.0330427137519231,
"grad_norm": 3.2733256816864014,
"learning_rate": 3.298400373491904e-05,
"loss": 6.3856,
"step": 28200
},
{
"combined_loss": 1.9145760536193848,
"distill_loss": 1.438834309577942,
"epoch": 1.0330427137519231,
"step": 28200,
"student_mlm_loss": 2.390317916870117
},
{
"epoch": 1.0367059857865044,
"grad_norm": 10.546121597290039,
"learning_rate": 3.292257414551441e-05,
"loss": 3.5422,
"step": 28300
},
{
"combined_loss": 2.6431736946105957,
"distill_loss": 1.367489218711853,
"epoch": 1.0367059857865044,
"step": 28300,
"student_mlm_loss": 3.918858289718628
},
{
"epoch": 1.0403692578210857,
"grad_norm": 25.674352645874023,
"learning_rate": 3.286114455610979e-05,
"loss": 6.2258,
"step": 28400
},
{
"combined_loss": 1.8416577577590942,
"distill_loss": 1.2867157459259033,
"epoch": 1.0403692578210857,
"step": 28400,
"student_mlm_loss": 2.396599769592285
},
{
"epoch": 1.044032529855667,
"grad_norm": 3.6745688915252686,
"learning_rate": 3.2799714966705166e-05,
"loss": 5.0647,
"step": 28500
},
{
"combined_loss": 1.9693520069122314,
"distill_loss": 1.3039644956588745,
"epoch": 1.044032529855667,
"step": 28500,
"student_mlm_loss": 2.634739637374878
},
{
"epoch": 1.0476958018902485,
"grad_norm": 40.79129409790039,
"learning_rate": 3.273828537730054e-05,
"loss": 2.6424,
"step": 28600
},
{
"combined_loss": 2.4251365661621094,
"distill_loss": 1.3121291399002075,
"epoch": 1.0476958018902485,
"step": 28600,
"student_mlm_loss": 3.5381438732147217
},
{
"epoch": 1.0513590739248297,
"grad_norm": 7.185906410217285,
"learning_rate": 3.2676855787895915e-05,
"loss": 2.9095,
"step": 28700
},
{
"combined_loss": 5.781175136566162,
"distill_loss": 1.3236074447631836,
"epoch": 1.0513590739248297,
"step": 28700,
"student_mlm_loss": 10.23874282836914
},
{
"epoch": 1.055022345959411,
"grad_norm": 7.2639079093933105,
"learning_rate": 3.2615426198491286e-05,
"loss": 3.0536,
"step": 28800
},
{
"combined_loss": 1.8534462451934814,
"distill_loss": 1.433970332145691,
"epoch": 1.055022345959411,
"step": 28800,
"student_mlm_loss": 2.2729220390319824
},
{
"epoch": 1.0586856179939923,
"grad_norm": 82.9974365234375,
"learning_rate": 3.255399660908667e-05,
"loss": 3.4605,
"step": 28900
},
{
"combined_loss": 2.385720729827881,
"distill_loss": 1.319982647895813,
"epoch": 1.0586856179939923,
"step": 28900,
"student_mlm_loss": 3.4514589309692383
},
{
"epoch": 1.0623488900285736,
"grad_norm": 8.101861000061035,
"learning_rate": 3.249256701968204e-05,
"loss": 2.9531,
"step": 29000
},
{
"combined_loss": 1.9569958448410034,
"distill_loss": 1.350255012512207,
"epoch": 1.0623488900285736,
"step": 29000,
"student_mlm_loss": 2.5637366771698
},
{
"epoch": 1.0660121620631549,
"grad_norm": 42.843135833740234,
"learning_rate": 3.243113743027742e-05,
"loss": 3.5336,
"step": 29100
},
{
"combined_loss": 2.0199599266052246,
"distill_loss": 1.1558183431625366,
"epoch": 1.0660121620631549,
"step": 29100,
"student_mlm_loss": 2.884101390838623
},
{
"epoch": 1.0696754340977361,
"grad_norm": 10.401261329650879,
"learning_rate": 3.236970784087279e-05,
"loss": 2.6909,
"step": 29200
},
{
"combined_loss": 1.898897409439087,
"distill_loss": 1.2361267805099487,
"epoch": 1.0696754340977361,
"step": 29200,
"student_mlm_loss": 2.5616679191589355
},
{
"epoch": 1.0733387061323174,
"grad_norm": 13.08026123046875,
"learning_rate": 3.230827825146817e-05,
"loss": 10.7499,
"step": 29300
},
{
"combined_loss": 2.385263442993164,
"distill_loss": 1.2960166931152344,
"epoch": 1.0733387061323174,
"step": 29300,
"student_mlm_loss": 3.4745099544525146
},
{
"epoch": 1.0770019781668987,
"grad_norm": 6.8822431564331055,
"learning_rate": 3.2246848662063546e-05,
"loss": 3.0651,
"step": 29400
},
{
"combined_loss": 2.1257505416870117,
"distill_loss": 1.3224972486495972,
"epoch": 1.0770019781668987,
"step": 29400,
"student_mlm_loss": 2.929003953933716
},
{
"epoch": 1.08066525020148,
"grad_norm": 3.4312744140625,
"learning_rate": 3.218541907265892e-05,
"loss": 3.1323,
"step": 29500
},
{
"combined_loss": 2.0117716789245605,
"distill_loss": 1.2447552680969238,
"epoch": 1.08066525020148,
"step": 29500,
"student_mlm_loss": 2.7787880897521973
},
{
"epoch": 1.0843285222360612,
"grad_norm": 3.970820426940918,
"learning_rate": 3.2123989483254295e-05,
"loss": 3.7427,
"step": 29600
},
{
"combined_loss": 2.493256092071533,
"distill_loss": 1.27970290184021,
"epoch": 1.0843285222360612,
"step": 29600,
"student_mlm_loss": 3.7068092823028564
},
{
"epoch": 1.0879917942706425,
"grad_norm": 5.8632426261901855,
"learning_rate": 3.206255989384967e-05,
"loss": 3.0698,
"step": 29700
},
{
"combined_loss": 2.017867088317871,
"distill_loss": 1.408115029335022,
"epoch": 1.0879917942706425,
"step": 29700,
"student_mlm_loss": 2.6276190280914307
},
{
"epoch": 1.0916550663052238,
"grad_norm": 7.350955963134766,
"learning_rate": 3.200113030444505e-05,
"loss": 10.1517,
"step": 29800
},
{
"combined_loss": 3.020230770111084,
"distill_loss": 1.1870992183685303,
"epoch": 1.0916550663052238,
"step": 29800,
"student_mlm_loss": 4.853362083435059
},
{
"epoch": 1.095318338339805,
"grad_norm": 14.347647666931152,
"learning_rate": 3.193970071504042e-05,
"loss": 2.8345,
"step": 29900
},
{
"combined_loss": 1.8037035465240479,
"distill_loss": 1.2421637773513794,
"epoch": 1.095318338339805,
"step": 29900,
"student_mlm_loss": 2.365243434906006
},
{
"epoch": 1.0989816103743864,
"grad_norm": 8.716060638427734,
"learning_rate": 3.18782711256358e-05,
"loss": 4.9073,
"step": 30000
},
{
"epoch": 1.0989816103743864,
"eval_loss": 3.289705753326416,
"eval_runtime": 2.6398,
"eval_samples_per_second": 2650.179,
"eval_steps_per_second": 10.607,
"step": 30000
},
{
"combined_loss": 3.3838839530944824,
"distill_loss": 1.2657897472381592,
"epoch": 1.0989816103743864,
"step": 30000,
"student_mlm_loss": 5.501977920532227
},
{
"epoch": 1.1026448824089676,
"grad_norm": 9.78013801574707,
"learning_rate": 3.181684153623117e-05,
"loss": 6.1366,
"step": 30100
},
{
"combined_loss": 1.8116616010665894,
"distill_loss": 1.3585631847381592,
"epoch": 1.1026448824089676,
"step": 30100,
"student_mlm_loss": 2.2647600173950195
},
{
"epoch": 1.106308154443549,
"grad_norm": 20.41010856628418,
"learning_rate": 3.175541194682655e-05,
"loss": 4.7028,
"step": 30200
},
{
"combined_loss": 1.9074151515960693,
"distill_loss": 1.119224190711975,
"epoch": 1.106308154443549,
"step": 30200,
"student_mlm_loss": 2.695605993270874
},
{
"epoch": 1.1099714264781302,
"grad_norm": 7.005733966827393,
"learning_rate": 3.1693982357421926e-05,
"loss": 4.9073,
"step": 30300
},
{
"combined_loss": 1.7690558433532715,
"distill_loss": 1.2762707471847534,
"epoch": 1.1099714264781302,
"step": 30300,
"student_mlm_loss": 2.2618408203125
},
{
"epoch": 1.1136346985127115,
"grad_norm": 4.290195465087891,
"learning_rate": 3.16325527680173e-05,
"loss": 4.1257,
"step": 30400
},
{
"combined_loss": 15.505983352661133,
"distill_loss": 1.252361536026001,
"epoch": 1.1136346985127115,
"step": 30400,
"student_mlm_loss": 29.759605407714844
},
{
"epoch": 1.1172979705472927,
"grad_norm": 27.59025764465332,
"learning_rate": 3.1571123178612675e-05,
"loss": 3.6319,
"step": 30500
},
{
"combined_loss": 3.190175771713257,
"distill_loss": 1.237632155418396,
"epoch": 1.1172979705472927,
"step": 30500,
"student_mlm_loss": 5.142719268798828
},
{
"epoch": 1.120961242581874,
"grad_norm": 35.681365966796875,
"learning_rate": 3.150969358920805e-05,
"loss": 5.2866,
"step": 30600
},
{
"combined_loss": 2.1486501693725586,
"distill_loss": 1.3570821285247803,
"epoch": 1.120961242581874,
"step": 30600,
"student_mlm_loss": 2.940218448638916
},
{
"epoch": 1.1246245146164555,
"grad_norm": 28.920949935913086,
"learning_rate": 3.144826399980343e-05,
"loss": 11.35,
"step": 30700
},
{
"combined_loss": 3.544619560241699,
"distill_loss": 1.3219174146652222,
"epoch": 1.1246245146164555,
"step": 30700,
"student_mlm_loss": 5.767321586608887
},
{
"epoch": 1.1282877866510368,
"grad_norm": 36.29865264892578,
"learning_rate": 3.13868344103988e-05,
"loss": 8.8748,
"step": 30800
},
{
"combined_loss": 3.136960744857788,
"distill_loss": 1.4069170951843262,
"epoch": 1.1282877866510368,
"step": 30800,
"student_mlm_loss": 4.86700439453125
},
{
"epoch": 1.131951058685618,
"grad_norm": 8.498424530029297,
"learning_rate": 3.132540482099417e-05,
"loss": 2.6175,
"step": 30900
},
{
"combined_loss": 2.584123373031616,
"distill_loss": 1.3318666219711304,
"epoch": 1.131951058685618,
"step": 30900,
"student_mlm_loss": 3.8363800048828125
},
{
"epoch": 1.1356143307201993,
"grad_norm": 8.784627914428711,
"learning_rate": 3.126397523158955e-05,
"loss": 3.7912,
"step": 31000
},
{
"combined_loss": 4.065792083740234,
"distill_loss": 1.279055118560791,
"epoch": 1.1356143307201993,
"step": 31000,
"student_mlm_loss": 6.8525285720825195
},
{
"epoch": 1.1392776027547806,
"grad_norm": 15.763399124145508,
"learning_rate": 3.120254564218493e-05,
"loss": 7.3671,
"step": 31100
},
{
"combined_loss": 1.9532334804534912,
"distill_loss": 1.2137418985366821,
"epoch": 1.1392776027547806,
"step": 31100,
"student_mlm_loss": 2.6927249431610107
},
{
"epoch": 1.142940874789362,
"grad_norm": 6.777341842651367,
"learning_rate": 3.1141116052780306e-05,
"loss": 2.8877,
"step": 31200
},
{
"combined_loss": 3.5847015380859375,
"distill_loss": 1.3712694644927979,
"epoch": 1.142940874789362,
"step": 31200,
"student_mlm_loss": 5.798133850097656
},
{
"epoch": 1.1466041468239432,
"grad_norm": 6.115112781524658,
"learning_rate": 3.107968646337568e-05,
"loss": 3.3763,
"step": 31300
},
{
"combined_loss": 1.899533748626709,
"distill_loss": 1.2805981636047363,
"epoch": 1.1466041468239432,
"step": 31300,
"student_mlm_loss": 2.5184693336486816
},
{
"epoch": 1.1502674188585245,
"grad_norm": 3.3896713256835938,
"learning_rate": 3.1018256873971055e-05,
"loss": 3.2932,
"step": 31400
},
{
"combined_loss": 1.9794254302978516,
"distill_loss": 1.3896270990371704,
"epoch": 1.1502674188585245,
"step": 31400,
"student_mlm_loss": 2.5692238807678223
},
{
"epoch": 1.1539306908931057,
"grad_norm": 12.824034690856934,
"learning_rate": 3.095682728456643e-05,
"loss": 3.5341,
"step": 31500
},
{
"combined_loss": 2.5983529090881348,
"distill_loss": 1.2135576009750366,
"epoch": 1.1539306908931057,
"step": 31500,
"student_mlm_loss": 3.9831480979919434
},
{
"epoch": 1.157593962927687,
"grad_norm": 73.47982025146484,
"learning_rate": 3.089539769516181e-05,
"loss": 2.9879,
"step": 31600
},
{
"combined_loss": 1.8584779500961304,
"distill_loss": 1.3214514255523682,
"epoch": 1.157593962927687,
"step": 31600,
"student_mlm_loss": 2.3955044746398926
},
{
"epoch": 1.1612572349622683,
"grad_norm": 5.6778340339660645,
"learning_rate": 3.083396810575718e-05,
"loss": 2.9781,
"step": 31700
},
{
"combined_loss": 4.854001045227051,
"distill_loss": 1.2088978290557861,
"epoch": 1.1612572349622683,
"step": 31700,
"student_mlm_loss": 8.499104499816895
},
{
"epoch": 1.1649205069968496,
"grad_norm": 17.93754768371582,
"learning_rate": 3.077253851635255e-05,
"loss": 3.5773,
"step": 31800
},
{
"combined_loss": 1.9064607620239258,
"distill_loss": 1.363638997077942,
"epoch": 1.1649205069968496,
"step": 31800,
"student_mlm_loss": 2.449282646179199
},
{
"epoch": 1.1685837790314308,
"grad_norm": 8.912027359008789,
"learning_rate": 3.071110892694794e-05,
"loss": 3.0949,
"step": 31900
},
{
"combined_loss": 1.9666361808776855,
"distill_loss": 1.3997029066085815,
"epoch": 1.1685837790314308,
"step": 31900,
"student_mlm_loss": 2.5335693359375
},
{
"epoch": 1.1722470510660121,
"grad_norm": 21.05866050720215,
"learning_rate": 3.064967933754331e-05,
"loss": 2.965,
"step": 32000
},
{
"epoch": 1.1722470510660121,
"eval_loss": 3.516061544418335,
"eval_runtime": 2.6391,
"eval_samples_per_second": 2650.903,
"eval_steps_per_second": 10.61,
"step": 32000
},
{
"combined_loss": 2.466904640197754,
"distill_loss": 1.2619636058807373,
"epoch": 1.1722470510660121,
"step": 32000,
"student_mlm_loss": 3.6718459129333496
},
{
"epoch": 1.1759103231005934,
"grad_norm": 14.288066864013672,
"learning_rate": 3.0588249748138686e-05,
"loss": 6.5656,
"step": 32100
},
{
"combined_loss": 5.987391471862793,
"distill_loss": 1.3964972496032715,
"epoch": 1.1759103231005934,
"step": 32100,
"student_mlm_loss": 10.578286170959473
},
{
"epoch": 1.1795735951351747,
"grad_norm": 10.953961372375488,
"learning_rate": 3.052682015873406e-05,
"loss": 7.1246,
"step": 32200
},
{
"combined_loss": 1.758845567703247,
"distill_loss": 1.2731348276138306,
"epoch": 1.1795735951351747,
"step": 32200,
"student_mlm_loss": 2.244556188583374
},
{
"epoch": 1.183236867169756,
"grad_norm": 17.076087951660156,
"learning_rate": 3.046539056932944e-05,
"loss": 7.3734,
"step": 32300
},
{
"combined_loss": 1.7941749095916748,
"distill_loss": 1.282630205154419,
"epoch": 1.183236867169756,
"step": 32300,
"student_mlm_loss": 2.3057196140289307
},
{
"epoch": 1.1869001392043372,
"grad_norm": 11.33812427520752,
"learning_rate": 3.040396097992481e-05,
"loss": 5.4979,
"step": 32400
},
{
"combined_loss": 2.379426956176758,
"distill_loss": 1.2975032329559326,
"epoch": 1.1869001392043372,
"step": 32400,
"student_mlm_loss": 3.461350917816162
},
{
"epoch": 1.1905634112389185,
"grad_norm": 3.6378591060638428,
"learning_rate": 3.0342531390520184e-05,
"loss": 5.077,
"step": 32500
},
{
"combined_loss": 1.835166573524475,
"distill_loss": 1.294168472290039,
"epoch": 1.1905634112389185,
"step": 32500,
"student_mlm_loss": 2.376164674758911
},
{
"epoch": 1.1942266832735,
"grad_norm": 23.017444610595703,
"learning_rate": 3.0281101801115562e-05,
"loss": 3.1428,
"step": 32600
},
{
"combined_loss": 1.8867619037628174,
"distill_loss": 1.2372292280197144,
"epoch": 1.1942266832735,
"step": 32600,
"student_mlm_loss": 2.536294460296631
},
{
"epoch": 1.197889955308081,
"grad_norm": 7.055652141571045,
"learning_rate": 3.0219672211710937e-05,
"loss": 8.7118,
"step": 32700
},
{
"combined_loss": 6.59044075012207,
"distill_loss": 1.3554973602294922,
"epoch": 1.197889955308081,
"step": 32700,
"student_mlm_loss": 11.825384140014648
},
{
"epoch": 1.2015532273426626,
"grad_norm": 6.935373783111572,
"learning_rate": 3.0158242622306314e-05,
"loss": 7.5763,
"step": 32800
},
{
"combined_loss": 2.4971964359283447,
"distill_loss": 1.2960432767868042,
"epoch": 1.2015532273426626,
"step": 32800,
"student_mlm_loss": 3.698349714279175
},
{
"epoch": 1.2052164993772438,
"grad_norm": 19.48725700378418,
"learning_rate": 3.009681303290169e-05,
"loss": 5.1993,
"step": 32900
},
{
"combined_loss": 2.639206886291504,
"distill_loss": 1.2536990642547607,
"epoch": 1.2052164993772438,
"step": 32900,
"student_mlm_loss": 4.024714469909668
},
{
"epoch": 1.2088797714118251,
"grad_norm": 215.4875946044922,
"learning_rate": 3.0035383443497067e-05,
"loss": 3.9297,
"step": 33000
},
{
"combined_loss": 2.1888670921325684,
"distill_loss": 1.4587746858596802,
"epoch": 1.2088797714118251,
"step": 33000,
"student_mlm_loss": 2.918959379196167
},
{
"epoch": 1.2125430434464064,
"grad_norm": 5.346382141113281,
"learning_rate": 2.997395385409244e-05,
"loss": 3.3704,
"step": 33100
},
{
"combined_loss": 2.5722949504852295,
"distill_loss": 1.2250982522964478,
"epoch": 1.2125430434464064,
"step": 33100,
"student_mlm_loss": 3.9194915294647217
},
{
"epoch": 1.2162063154809877,
"grad_norm": 21.193038940429688,
"learning_rate": 2.991252426468782e-05,
"loss": 3.22,
"step": 33200
},
{
"combined_loss": 1.8822517395019531,
"distill_loss": 1.264020323753357,
"epoch": 1.2162063154809877,
"step": 33200,
"student_mlm_loss": 2.5004830360412598
},
{
"epoch": 1.219869587515569,
"grad_norm": 8.840603828430176,
"learning_rate": 2.9851094675283193e-05,
"loss": 13.091,
"step": 33300
},
{
"combined_loss": 2.0461645126342773,
"distill_loss": 1.3376085758209229,
"epoch": 1.219869587515569,
"step": 33300,
"student_mlm_loss": 2.7547202110290527
},
{
"epoch": 1.2235328595501502,
"grad_norm": 16.414852142333984,
"learning_rate": 2.9789665085878564e-05,
"loss": 3.6096,
"step": 33400
},
{
"combined_loss": 1.8437246084213257,
"distill_loss": 1.2731173038482666,
"epoch": 1.2235328595501502,
"step": 33400,
"student_mlm_loss": 2.4143319129943848
},
{
"epoch": 1.2271961315847315,
"grad_norm": 5.047356605529785,
"learning_rate": 2.9728235496473946e-05,
"loss": 10.6014,
"step": 33500
},
{
"combined_loss": 2.0613672733306885,
"distill_loss": 1.1784592866897583,
"epoch": 1.2271961315847315,
"step": 33500,
"student_mlm_loss": 2.944275140762329
},
{
"epoch": 1.2308594036193128,
"grad_norm": 8.502574920654297,
"learning_rate": 2.9666805907069317e-05,
"loss": 12.6532,
"step": 33600
},
{
"combined_loss": 2.301725149154663,
"distill_loss": 1.2482868432998657,
"epoch": 1.2308594036193128,
"step": 33600,
"student_mlm_loss": 3.355163335800171
},
{
"epoch": 1.234522675653894,
"grad_norm": 25.97445297241211,
"learning_rate": 2.9605376317664695e-05,
"loss": 3.1296,
"step": 33700
},
{
"combined_loss": 1.8135402202606201,
"distill_loss": 1.309229850769043,
"epoch": 1.234522675653894,
"step": 33700,
"student_mlm_loss": 2.3178505897521973
},
{
"epoch": 1.2381859476884753,
"grad_norm": 7.912507057189941,
"learning_rate": 2.954394672826007e-05,
"loss": 2.9749,
"step": 33800
},
{
"combined_loss": 1.9506487846374512,
"distill_loss": 1.3808802366256714,
"epoch": 1.2381859476884753,
"step": 33800,
"student_mlm_loss": 2.5204174518585205
},
{
"epoch": 1.2418492197230566,
"grad_norm": 28.239988327026367,
"learning_rate": 2.9482517138855447e-05,
"loss": 5.7527,
"step": 33900
},
{
"combined_loss": 1.881349802017212,
"distill_loss": 1.3489292860031128,
"epoch": 1.2418492197230566,
"step": 33900,
"student_mlm_loss": 2.4137701988220215
},
{
"epoch": 1.245512491757638,
"grad_norm": 25.953353881835938,
"learning_rate": 2.942108754945082e-05,
"loss": 4.0339,
"step": 34000
},
{
"epoch": 1.245512491757638,
"eval_loss": 3.297154188156128,
"eval_runtime": 2.3826,
"eval_samples_per_second": 2936.248,
"eval_steps_per_second": 11.752,
"step": 34000
},
{
"combined_loss": 2.5429787635803223,
"distill_loss": 1.2718520164489746,
"epoch": 1.245512491757638,
"step": 34000,
"student_mlm_loss": 3.814105272293091
},
{
"epoch": 1.2491757637922192,
"grad_norm": 48.45500183105469,
"learning_rate": 2.9359657960046196e-05,
"loss": 6.1408,
"step": 34100
},
{
"combined_loss": 4.794422626495361,
"distill_loss": 1.3052036762237549,
"epoch": 1.2491757637922192,
"step": 34100,
"student_mlm_loss": 8.283641815185547
},
{
"epoch": 1.2528390358268005,
"grad_norm": 6.028234004974365,
"learning_rate": 2.9298228370641574e-05,
"loss": 2.9116,
"step": 34200
},
{
"combined_loss": 2.125443458557129,
"distill_loss": 1.25053071975708,
"epoch": 1.2528390358268005,
"step": 34200,
"student_mlm_loss": 3.0003561973571777
},
{
"epoch": 1.2565023078613817,
"grad_norm": 15.824817657470703,
"learning_rate": 2.9236798781236945e-05,
"loss": 3.5834,
"step": 34300
},
{
"combined_loss": 2.156796932220459,
"distill_loss": 1.1805670261383057,
"epoch": 1.2565023078613817,
"step": 34300,
"student_mlm_loss": 3.1330268383026123
},
{
"epoch": 1.260165579895963,
"grad_norm": 8.438326835632324,
"learning_rate": 2.9175369191832326e-05,
"loss": 5.0724,
"step": 34400
},
{
"combined_loss": 3.144615888595581,
"distill_loss": 1.2467416524887085,
"epoch": 1.260165579895963,
"step": 34400,
"student_mlm_loss": 5.042490005493164
},
{
"epoch": 1.2638288519305443,
"grad_norm": 3.7252449989318848,
"learning_rate": 2.9113939602427697e-05,
"loss": 2.9306,
"step": 34500
},
{
"combined_loss": 4.309004783630371,
"distill_loss": 1.2629985809326172,
"epoch": 1.2638288519305443,
"step": 34500,
"student_mlm_loss": 7.355010986328125
},
{
"epoch": 1.2674921239651256,
"grad_norm": 14.86426067352295,
"learning_rate": 2.9052510013023078e-05,
"loss": 3.059,
"step": 34600
},
{
"combined_loss": 2.128227472305298,
"distill_loss": 1.3674236536026,
"epoch": 1.2674921239651256,
"step": 34600,
"student_mlm_loss": 2.889031171798706
},
{
"epoch": 1.271155395999707,
"grad_norm": 14.947731018066406,
"learning_rate": 2.899108042361845e-05,
"loss": 3.0461,
"step": 34700
},
{
"combined_loss": 1.9557018280029297,
"distill_loss": 1.3122907876968384,
"epoch": 1.271155395999707,
"step": 34700,
"student_mlm_loss": 2.5991127490997314
},
{
"epoch": 1.2748186680342881,
"grad_norm": 4.714714527130127,
"learning_rate": 2.8929650834213824e-05,
"loss": 3.0221,
"step": 34800
},
{
"combined_loss": 1.7830932140350342,
"distill_loss": 1.278725028038025,
"epoch": 1.2748186680342881,
"step": 34800,
"student_mlm_loss": 2.287461519241333
},
{
"epoch": 1.2784819400688696,
"grad_norm": 13.885130882263184,
"learning_rate": 2.88682212448092e-05,
"loss": 8.529,
"step": 34900
},
{
"combined_loss": 4.974426746368408,
"distill_loss": 1.4173694849014282,
"epoch": 1.2784819400688696,
"step": 34900,
"student_mlm_loss": 8.53148365020752
},
{
"epoch": 1.2821452121034507,
"grad_norm": 6.786545753479004,
"learning_rate": 2.8806791655404576e-05,
"loss": 3.563,
"step": 35000
},
{
"combined_loss": 1.7134695053100586,
"distill_loss": 1.2251827716827393,
"epoch": 1.2821452121034507,
"step": 35000,
"student_mlm_loss": 2.201756238937378
},
{
"epoch": 1.2858084841380322,
"grad_norm": 18.235891342163086,
"learning_rate": 2.8745362065999954e-05,
"loss": 6.9188,
"step": 35100
},
{
"combined_loss": 6.00921106338501,
"distill_loss": 1.3103188276290894,
"epoch": 1.2858084841380322,
"step": 35100,
"student_mlm_loss": 10.70810317993164
},
{
"epoch": 1.2894717561726134,
"grad_norm": 6.3708696365356445,
"learning_rate": 2.8683932476595328e-05,
"loss": 6.7695,
"step": 35200
},
{
"combined_loss": 2.2400052547454834,
"distill_loss": 1.3289698362350464,
"epoch": 1.2894717561726134,
"step": 35200,
"student_mlm_loss": 3.151040554046631
},
{
"epoch": 1.2931350282071947,
"grad_norm": 7.5602946281433105,
"learning_rate": 2.8622502887190706e-05,
"loss": 9.8005,
"step": 35300
},
{
"combined_loss": 1.848390817642212,
"distill_loss": 1.2897430658340454,
"epoch": 1.2931350282071947,
"step": 35300,
"student_mlm_loss": 2.407038688659668
},
{
"epoch": 1.296798300241776,
"grad_norm": 24.799640655517578,
"learning_rate": 2.8561073297786077e-05,
"loss": 3.2996,
"step": 35400
},
{
"combined_loss": 4.894403457641602,
"distill_loss": 1.282358169555664,
"epoch": 1.296798300241776,
"step": 35400,
"student_mlm_loss": 8.506448745727539
},
{
"epoch": 1.3004615722763573,
"grad_norm": 34.4364013671875,
"learning_rate": 2.849964370838146e-05,
"loss": 3.399,
"step": 35500
},
{
"combined_loss": 1.7965787649154663,
"distill_loss": 1.3232142925262451,
"epoch": 1.3004615722763573,
"step": 35500,
"student_mlm_loss": 2.2699432373046875
},
{
"epoch": 1.3041248443109386,
"grad_norm": 7.9551825523376465,
"learning_rate": 2.843821411897683e-05,
"loss": 3.1887,
"step": 35600
},
{
"combined_loss": 1.855729579925537,
"distill_loss": 1.2217527627944946,
"epoch": 1.3041248443109386,
"step": 35600,
"student_mlm_loss": 2.48970627784729
},
{
"epoch": 1.3077881163455198,
"grad_norm": 5.838754177093506,
"learning_rate": 2.8376784529572204e-05,
"loss": 3.1524,
"step": 35700
},
{
"combined_loss": 2.3417129516601562,
"distill_loss": 1.2872867584228516,
"epoch": 1.3077881163455198,
"step": 35700,
"student_mlm_loss": 3.39613938331604
},
{
"epoch": 1.3114513883801011,
"grad_norm": 4.118559837341309,
"learning_rate": 2.831535494016758e-05,
"loss": 7.9754,
"step": 35800
},
{
"combined_loss": 3.906961679458618,
"distill_loss": 1.2905327081680298,
"epoch": 1.3114513883801011,
"step": 35800,
"student_mlm_loss": 6.523390769958496
},
{
"epoch": 1.3151146604146824,
"grad_norm": 5.229255199432373,
"learning_rate": 2.8253925350762956e-05,
"loss": 3.6586,
"step": 35900
},
{
"combined_loss": 2.6259002685546875,
"distill_loss": 1.217278003692627,
"epoch": 1.3151146604146824,
"step": 35900,
"student_mlm_loss": 4.034522533416748
},
{
"epoch": 1.3187779324492637,
"grad_norm": 9.182631492614746,
"learning_rate": 2.8192495761358334e-05,
"loss": 8.5789,
"step": 36000
},
{
"epoch": 1.3187779324492637,
"eval_loss": 3.3097567558288574,
"eval_runtime": 1.9861,
"eval_samples_per_second": 3522.525,
"eval_steps_per_second": 14.098,
"step": 36000
},
{
"combined_loss": 15.921034812927246,
"distill_loss": 1.2575896978378296,
"epoch": 1.3187779324492637,
"step": 36000,
"student_mlm_loss": 30.58448028564453
},
{
"epoch": 1.322441204483845,
"grad_norm": 5.999209880828857,
"learning_rate": 2.813106617195371e-05,
"loss": 3.6109,
"step": 36100
},
{
"combined_loss": 204.92184448242188,
"distill_loss": 1.2291535139083862,
"epoch": 1.322441204483845,
"step": 36100,
"student_mlm_loss": 408.6145324707031
},
{
"epoch": 1.3261044765184262,
"grad_norm": 8.351846694946289,
"learning_rate": 2.8069636582549086e-05,
"loss": 5.9753,
"step": 36200
},
{
"combined_loss": 3.7332310676574707,
"distill_loss": 1.377110481262207,
"epoch": 1.3261044765184262,
"step": 36200,
"student_mlm_loss": 6.089351654052734
},
{
"epoch": 1.3297677485530075,
"grad_norm": 4.738751411437988,
"learning_rate": 2.800820699314446e-05,
"loss": 2.8706,
"step": 36300
},
{
"combined_loss": 1.949210286140442,
"distill_loss": 1.1820151805877686,
"epoch": 1.3297677485530075,
"step": 36300,
"student_mlm_loss": 2.7164053916931152
},
{
"epoch": 1.3334310205875888,
"grad_norm": 3.7835421562194824,
"learning_rate": 2.7946777403739832e-05,
"loss": 3.5794,
"step": 36400
},
{
"combined_loss": 1.7922800779342651,
"distill_loss": 1.2455928325653076,
"epoch": 1.3334310205875888,
"step": 36400,
"student_mlm_loss": 2.3389673233032227
},
{
"epoch": 1.33709429262217,
"grad_norm": 22.528881072998047,
"learning_rate": 2.788534781433521e-05,
"loss": 3.8623,
"step": 36500
},
{
"combined_loss": 1.788147211074829,
"distill_loss": 1.2254056930541992,
"epoch": 1.33709429262217,
"step": 36500,
"student_mlm_loss": 2.350888729095459
},
{
"epoch": 1.3407575646567513,
"grad_norm": 5.876169681549072,
"learning_rate": 2.7823918224930584e-05,
"loss": 8.4137,
"step": 36600
},
{
"combined_loss": 2.0377962589263916,
"distill_loss": 1.2204126119613647,
"epoch": 1.3407575646567513,
"step": 36600,
"student_mlm_loss": 2.855179786682129
},
{
"epoch": 1.3444208366913326,
"grad_norm": 20.921276092529297,
"learning_rate": 2.7762488635525962e-05,
"loss": 3.5857,
"step": 36700
},
{
"combined_loss": 1.9521321058273315,
"distill_loss": 1.249513864517212,
"epoch": 1.3444208366913326,
"step": 36700,
"student_mlm_loss": 2.654750347137451
},
{
"epoch": 1.348084108725914,
"grad_norm": 13.851704597473145,
"learning_rate": 2.7701059046121336e-05,
"loss": 3.8678,
"step": 36800
},
{
"combined_loss": 2.2560389041900635,
"distill_loss": 1.2315130233764648,
"epoch": 1.348084108725914,
"step": 36800,
"student_mlm_loss": 3.280564785003662
},
{
"epoch": 1.3517473807604952,
"grad_norm": 16.56214714050293,
"learning_rate": 2.7639629456716714e-05,
"loss": 3.3998,
"step": 36900
},
{
"combined_loss": 3.098896026611328,
"distill_loss": 1.3377043008804321,
"epoch": 1.3517473807604952,
"step": 36900,
"student_mlm_loss": 4.860087871551514
},
{
"epoch": 1.3554106527950767,
"grad_norm": 35.91291809082031,
"learning_rate": 2.757819986731209e-05,
"loss": 3.761,
"step": 37000
},
{
"combined_loss": 1.9794631004333496,
"distill_loss": 1.3087836503982544,
"epoch": 1.3554106527950767,
"step": 37000,
"student_mlm_loss": 2.6501426696777344
},
{
"epoch": 1.3590739248296577,
"grad_norm": 11.776296615600586,
"learning_rate": 2.7516770277907466e-05,
"loss": 3.9886,
"step": 37100
},
{
"combined_loss": 2.3107573986053467,
"distill_loss": 1.268768310546875,
"epoch": 1.3590739248296577,
"step": 37100,
"student_mlm_loss": 3.3527464866638184
},
{
"epoch": 1.3627371968642392,
"grad_norm": 13.237029075622559,
"learning_rate": 2.745534068850284e-05,
"loss": 5.3161,
"step": 37200
},
{
"combined_loss": 4.210747718811035,
"distill_loss": 1.4009877443313599,
"epoch": 1.3627371968642392,
"step": 37200,
"student_mlm_loss": 7.0205078125
},
{
"epoch": 1.3664004688988205,
"grad_norm": 18.256624221801758,
"learning_rate": 2.7393911099098212e-05,
"loss": 3.3122,
"step": 37300
},
{
"combined_loss": 2.467655658721924,
"distill_loss": 1.3313319683074951,
"epoch": 1.3664004688988205,
"step": 37300,
"student_mlm_loss": 3.6039793491363525
},
{
"epoch": 1.3700637409334018,
"grad_norm": 3.6821129322052,
"learning_rate": 2.7332481509693593e-05,
"loss": 2.5638,
"step": 37400
},
{
"combined_loss": 4.0961503982543945,
"distill_loss": 1.2590566873550415,
"epoch": 1.3700637409334018,
"step": 37400,
"student_mlm_loss": 6.933243751525879
},
{
"epoch": 1.373727012967983,
"grad_norm": 9.491351127624512,
"learning_rate": 2.7271051920288964e-05,
"loss": 5.2572,
"step": 37500
},
{
"combined_loss": 1.8323596715927124,
"distill_loss": 1.2323403358459473,
"epoch": 1.373727012967983,
"step": 37500,
"student_mlm_loss": 2.4323790073394775
},
{
"epoch": 1.3773902850025643,
"grad_norm": 10.13337516784668,
"learning_rate": 2.7209622330884342e-05,
"loss": 2.9805,
"step": 37600
},
{
"combined_loss": 2.7236733436584473,
"distill_loss": 1.2598845958709717,
"epoch": 1.3773902850025643,
"step": 37600,
"student_mlm_loss": 4.187462329864502
},
{
"epoch": 1.3810535570371456,
"grad_norm": 22.098358154296875,
"learning_rate": 2.7148192741479716e-05,
"loss": 3.1095,
"step": 37700
},
{
"combined_loss": 1.7910634279251099,
"distill_loss": 1.271672010421753,
"epoch": 1.3810535570371456,
"step": 37700,
"student_mlm_loss": 2.310454845428467
},
{
"epoch": 1.3847168290717269,
"grad_norm": 233.01779174804688,
"learning_rate": 2.7086763152075094e-05,
"loss": 3.0334,
"step": 37800
},
{
"combined_loss": 2.449730396270752,
"distill_loss": 1.343329906463623,
"epoch": 1.3847168290717269,
"step": 37800,
"student_mlm_loss": 3.556130886077881
},
{
"epoch": 1.3883801011063082,
"grad_norm": 7.459797382354736,
"learning_rate": 2.702533356267047e-05,
"loss": 5.0088,
"step": 37900
},
{
"combined_loss": 2.047302722930908,
"distill_loss": 1.2358465194702148,
"epoch": 1.3883801011063082,
"step": 37900,
"student_mlm_loss": 2.8587586879730225
},
{
"epoch": 1.3920433731408894,
"grad_norm": 3.9627275466918945,
"learning_rate": 2.6963903973265843e-05,
"loss": 2.7476,
"step": 38000
},
{
"epoch": 1.3920433731408894,
"eval_loss": 4.346156120300293,
"eval_runtime": 1.974,
"eval_samples_per_second": 3544.088,
"eval_steps_per_second": 14.184,
"step": 38000
},
{
"combined_loss": 2.4468555450439453,
"distill_loss": 1.166190505027771,
"epoch": 1.3920433731408894,
"step": 38000,
"student_mlm_loss": 3.72752046585083
},
{
"epoch": 1.3957066451754707,
"grad_norm": 11.812987327575684,
"learning_rate": 2.690247438386122e-05,
"loss": 3.8226,
"step": 38100
},
{
"combined_loss": 2.274935245513916,
"distill_loss": 1.3503799438476562,
"epoch": 1.3957066451754707,
"step": 38100,
"student_mlm_loss": 3.199490785598755
},
{
"epoch": 1.399369917210052,
"grad_norm": 6.545460224151611,
"learning_rate": 2.6841044794456592e-05,
"loss": 4.1598,
"step": 38200
},
{
"combined_loss": 2.1577343940734863,
"distill_loss": 1.2623993158340454,
"epoch": 1.399369917210052,
"step": 38200,
"student_mlm_loss": 3.0530693531036377
},
{
"epoch": 1.4030331892446333,
"grad_norm": 7.286951541900635,
"learning_rate": 2.6779615205051973e-05,
"loss": 3.8211,
"step": 38300
},
{
"combined_loss": 2.479806900024414,
"distill_loss": 1.2152717113494873,
"epoch": 1.4030331892446333,
"step": 38300,
"student_mlm_loss": 3.74434232711792
},
{
"epoch": 1.4066964612792145,
"grad_norm": 18.360294342041016,
"learning_rate": 2.6718185615647344e-05,
"loss": 3.3871,
"step": 38400
},
{
"combined_loss": 1.7289254665374756,
"distill_loss": 1.3171356916427612,
"epoch": 1.4066964612792145,
"step": 38400,
"student_mlm_loss": 2.1407151222229004
},
{
"epoch": 1.4103597333137958,
"grad_norm": 8.086026191711426,
"learning_rate": 2.6656756026242726e-05,
"loss": 2.6337,
"step": 38500
},
{
"combined_loss": 1.9621633291244507,
"distill_loss": 1.3215687274932861,
"epoch": 1.4103597333137958,
"step": 38500,
"student_mlm_loss": 2.6027579307556152
},
{
"epoch": 1.414023005348377,
"grad_norm": 13.378824234008789,
"learning_rate": 2.6595326436838097e-05,
"loss": 3.4032,
"step": 38600
},
{
"combined_loss": 37.448326110839844,
"distill_loss": 1.2198776006698608,
"epoch": 1.414023005348377,
"step": 38600,
"student_mlm_loss": 73.67677307128906
},
{
"epoch": 1.4176862773829584,
"grad_norm": 5.834230422973633,
"learning_rate": 2.653389684743347e-05,
"loss": 6.724,
"step": 38700
},
{
"combined_loss": 1.8702625036239624,
"distill_loss": 1.2802906036376953,
"epoch": 1.4176862773829584,
"step": 38700,
"student_mlm_loss": 2.4602344036102295
},
{
"epoch": 1.4213495494175397,
"grad_norm": 3.5685741901397705,
"learning_rate": 2.647246725802885e-05,
"loss": 3.2721,
"step": 38800
},
{
"combined_loss": 1.7411483526229858,
"distill_loss": 1.285083532333374,
"epoch": 1.4213495494175397,
"step": 38800,
"student_mlm_loss": 2.1972131729125977
},
{
"epoch": 1.4250128214521212,
"grad_norm": 8.644251823425293,
"learning_rate": 2.6411037668624223e-05,
"loss": 13.6859,
"step": 38900
},
{
"combined_loss": 3.234241008758545,
"distill_loss": 1.2654619216918945,
"epoch": 1.4250128214521212,
"step": 38900,
"student_mlm_loss": 5.203020095825195
},
{
"epoch": 1.4286760934867022,
"grad_norm": 15.043992042541504,
"learning_rate": 2.63496080792196e-05,
"loss": 4.3161,
"step": 39000
},
{
"combined_loss": 2.013312339782715,
"distill_loss": 1.2555652856826782,
"epoch": 1.4286760934867022,
"step": 39000,
"student_mlm_loss": 2.771059274673462
},
{
"epoch": 1.4323393655212837,
"grad_norm": 35.315345764160156,
"learning_rate": 2.6288178489814976e-05,
"loss": 6.3089,
"step": 39100
},
{
"combined_loss": 1.7854509353637695,
"distill_loss": 1.2994376420974731,
"epoch": 1.4323393655212837,
"step": 39100,
"student_mlm_loss": 2.2714641094207764
},
{
"epoch": 1.4360026375558648,
"grad_norm": 8.155647277832031,
"learning_rate": 2.6226748900410353e-05,
"loss": 3.3881,
"step": 39200
},
{
"combined_loss": 1.8790473937988281,
"distill_loss": 1.2656193971633911,
"epoch": 1.4360026375558648,
"step": 39200,
"student_mlm_loss": 2.4924752712249756
},
{
"epoch": 1.4396659095904463,
"grad_norm": 4.777060508728027,
"learning_rate": 2.6165319311005725e-05,
"loss": 3.0181,
"step": 39300
},
{
"combined_loss": 2.2714784145355225,
"distill_loss": 1.2724400758743286,
"epoch": 1.4396659095904463,
"step": 39300,
"student_mlm_loss": 3.270516872406006
},
{
"epoch": 1.4433291816250275,
"grad_norm": 3.7660317420959473,
"learning_rate": 2.6103889721601106e-05,
"loss": 3.3045,
"step": 39400
},
{
"combined_loss": 1.9759800434112549,
"distill_loss": 1.1767717599868774,
"epoch": 1.4433291816250275,
"step": 39400,
"student_mlm_loss": 2.775188446044922
},
{
"epoch": 1.4469924536596088,
"grad_norm": 55.78919982910156,
"learning_rate": 2.6042460132196477e-05,
"loss": 3.5094,
"step": 39500
},
{
"combined_loss": 2.5586395263671875,
"distill_loss": 1.3177176713943481,
"epoch": 1.4469924536596088,
"step": 39500,
"student_mlm_loss": 3.7995612621307373
},
{
"epoch": 1.45065572569419,
"grad_norm": 11.648473739624023,
"learning_rate": 2.598103054279185e-05,
"loss": 6.3066,
"step": 39600
},
{
"combined_loss": 1.8263496160507202,
"distill_loss": 1.2649195194244385,
"epoch": 1.45065572569419,
"step": 39600,
"student_mlm_loss": 2.387779712677002
},
{
"epoch": 1.4543189977287714,
"grad_norm": 4.982020378112793,
"learning_rate": 2.591960095338723e-05,
"loss": 3.1475,
"step": 39700
},
{
"combined_loss": 4.95673131942749,
"distill_loss": 1.2415388822555542,
"epoch": 1.4543189977287714,
"step": 39700,
"student_mlm_loss": 8.671923637390137
},
{
"epoch": 1.4579822697633527,
"grad_norm": 4.551340103149414,
"learning_rate": 2.5858171363982604e-05,
"loss": 6.0043,
"step": 39800
},
{
"combined_loss": 2.124246597290039,
"distill_loss": 1.197386384010315,
"epoch": 1.4579822697633527,
"step": 39800,
"student_mlm_loss": 3.0511069297790527
},
{
"epoch": 1.461645541797934,
"grad_norm": 41.217533111572266,
"learning_rate": 2.579674177457798e-05,
"loss": 2.7216,
"step": 39900
},
{
"combined_loss": 1.8579926490783691,
"distill_loss": 1.1948734521865845,
"epoch": 1.461645541797934,
"step": 39900,
"student_mlm_loss": 2.5211119651794434
},
{
"epoch": 1.4653088138325152,
"grad_norm": 3.3428897857666016,
"learning_rate": 2.5735312185173356e-05,
"loss": 3.5888,
"step": 40000
},
{
"epoch": 1.4653088138325152,
"eval_loss": 3.433469295501709,
"eval_runtime": 2.0987,
"eval_samples_per_second": 3333.452,
"eval_steps_per_second": 13.341,
"step": 40000
},
{
"combined_loss": 3.9790029525756836,
"distill_loss": 1.2571158409118652,
"epoch": 1.4653088138325152,
"step": 40000,
"student_mlm_loss": 6.700890064239502
},
{
"epoch": 1.4689720858670965,
"grad_norm": 24.387128829956055,
"learning_rate": 2.5673882595768734e-05,
"loss": 3.3546,
"step": 40100
},
{
"combined_loss": 2.113370418548584,
"distill_loss": 1.2904696464538574,
"epoch": 1.4689720858670965,
"step": 40100,
"student_mlm_loss": 2.9362711906433105
},
{
"epoch": 1.4726353579016778,
"grad_norm": 11.271422386169434,
"learning_rate": 2.5612453006364108e-05,
"loss": 9.1182,
"step": 40200
},
{
"combined_loss": 1.7249795198440552,
"distill_loss": 1.2220125198364258,
"epoch": 1.4726353579016778,
"step": 40200,
"student_mlm_loss": 2.2279465198516846
},
{
"epoch": 1.476298629936259,
"grad_norm": 88.92086029052734,
"learning_rate": 2.555102341695948e-05,
"loss": 5.5622,
"step": 40300
},
{
"combined_loss": 3.5107364654541016,
"distill_loss": 1.2663298845291138,
"epoch": 1.476298629936259,
"step": 40300,
"student_mlm_loss": 5.755143165588379
},
{
"epoch": 1.4799619019708403,
"grad_norm": 4.677048683166504,
"learning_rate": 2.5489593827554857e-05,
"loss": 5.3278,
"step": 40400
},
{
"combined_loss": 3.5298116207122803,
"distill_loss": 1.1846145391464233,
"epoch": 1.4799619019708403,
"step": 40400,
"student_mlm_loss": 5.875008583068848
},
{
"epoch": 1.4836251740054216,
"grad_norm": 21.207704544067383,
"learning_rate": 2.542816423815023e-05,
"loss": 2.9588,
"step": 40500
},
{
"combined_loss": 2.6109657287597656,
"distill_loss": 1.2608091831207275,
"epoch": 1.4836251740054216,
"step": 40500,
"student_mlm_loss": 3.9611220359802246
},
{
"epoch": 1.4872884460400029,
"grad_norm": 7.7415876388549805,
"learning_rate": 2.536673464874561e-05,
"loss": 2.706,
"step": 40600
},
{
"combined_loss": 2.455023765563965,
"distill_loss": 1.3175585269927979,
"epoch": 1.4872884460400029,
"step": 40600,
"student_mlm_loss": 3.5924887657165527
},
{
"epoch": 1.4909517180745842,
"grad_norm": 19.366378784179688,
"learning_rate": 2.5305305059340984e-05,
"loss": 2.7981,
"step": 40700
},
{
"combined_loss": 3.624007225036621,
"distill_loss": 1.1402699947357178,
"epoch": 1.4909517180745842,
"step": 40700,
"student_mlm_loss": 6.1077446937561035
},
{
"epoch": 1.4946149901091654,
"grad_norm": 7.310671806335449,
"learning_rate": 2.524387546993636e-05,
"loss": 29.272,
"step": 40800
},
{
"combined_loss": 2.2329726219177246,
"distill_loss": 1.303555965423584,
"epoch": 1.4946149901091654,
"step": 40800,
"student_mlm_loss": 3.1623895168304443
},
{
"epoch": 1.4982782621437467,
"grad_norm": 48.7297477722168,
"learning_rate": 2.5182445880531736e-05,
"loss": 3.1319,
"step": 40900
},
{
"combined_loss": 1.8255285024642944,
"distill_loss": 1.1643202304840088,
"epoch": 1.4982782621437467,
"step": 40900,
"student_mlm_loss": 2.48673677444458
},
{
"epoch": 1.5019415341783282,
"grad_norm": 32.60409927368164,
"learning_rate": 2.5121016291127114e-05,
"loss": 8.524,
"step": 41000
},
{
"combined_loss": 2.896923542022705,
"distill_loss": 1.3571655750274658,
"epoch": 1.5019415341783282,
"step": 41000,
"student_mlm_loss": 4.436681747436523
},
{
"epoch": 1.5056048062129093,
"grad_norm": 4.127974510192871,
"learning_rate": 2.5059586701722488e-05,
"loss": 6.3087,
"step": 41100
},
{
"combined_loss": 2.145819664001465,
"distill_loss": 1.2983198165893555,
"epoch": 1.5056048062129093,
"step": 41100,
"student_mlm_loss": 2.993319511413574
},
{
"epoch": 1.5092680782474908,
"grad_norm": 3.873206853866577,
"learning_rate": 2.4998157112317863e-05,
"loss": 5.279,
"step": 41200
},
{
"combined_loss": 4.8266730308532715,
"distill_loss": 1.1676665544509888,
"epoch": 1.5092680782474908,
"step": 41200,
"student_mlm_loss": 8.485679626464844
},
{
"epoch": 1.5129313502820718,
"grad_norm": 6.902312755584717,
"learning_rate": 2.493672752291324e-05,
"loss": 5.3583,
"step": 41300
},
{
"combined_loss": 1.7068848609924316,
"distill_loss": 1.1335561275482178,
"epoch": 1.5129313502820718,
"step": 41300,
"student_mlm_loss": 2.2802135944366455
},
{
"epoch": 1.5165946223166533,
"grad_norm": 17.415306091308594,
"learning_rate": 2.487529793350861e-05,
"loss": 2.8319,
"step": 41400
},
{
"combined_loss": 1.5696630477905273,
"distill_loss": 1.152633786201477,
"epoch": 1.5165946223166533,
"step": 41400,
"student_mlm_loss": 1.9866924285888672
},
{
"epoch": 1.5202578943512344,
"grad_norm": 11.67779541015625,
"learning_rate": 2.481386834410399e-05,
"loss": 3.0117,
"step": 41500
},
{
"combined_loss": 1.9209272861480713,
"distill_loss": 1.2611881494522095,
"epoch": 1.5202578943512344,
"step": 41500,
"student_mlm_loss": 2.5806663036346436
},
{
"epoch": 1.5239211663858159,
"grad_norm": 9.814743041992188,
"learning_rate": 2.4752438754699364e-05,
"loss": 2.8479,
"step": 41600
},
{
"combined_loss": 4.1822404861450195,
"distill_loss": 1.254117488861084,
"epoch": 1.5239211663858159,
"step": 41600,
"student_mlm_loss": 7.110363960266113
},
{
"epoch": 1.5275844384203972,
"grad_norm": 11.7344970703125,
"learning_rate": 2.4691009165294742e-05,
"loss": 3.2502,
"step": 41700
},
{
"combined_loss": 1.7558622360229492,
"distill_loss": 1.1821727752685547,
"epoch": 1.5275844384203972,
"step": 41700,
"student_mlm_loss": 2.3295516967773438
},
{
"epoch": 1.5312477104549784,
"grad_norm": 8.426025390625,
"learning_rate": 2.4629579575890116e-05,
"loss": 3.3169,
"step": 41800
},
{
"combined_loss": 1.843000054359436,
"distill_loss": 1.1456735134124756,
"epoch": 1.5312477104549784,
"step": 41800,
"student_mlm_loss": 2.5403265953063965
},
{
"epoch": 1.5349109824895597,
"grad_norm": 3.654872417449951,
"learning_rate": 2.456814998648549e-05,
"loss": 2.6259,
"step": 41900
},
{
"combined_loss": 1.7651002407073975,
"distill_loss": 1.1741529703140259,
"epoch": 1.5349109824895597,
"step": 41900,
"student_mlm_loss": 2.3560476303100586
},
{
"epoch": 1.538574254524141,
"grad_norm": 18.605615615844727,
"learning_rate": 2.450672039708087e-05,
"loss": 2.4854,
"step": 42000
},
{
"epoch": 1.538574254524141,
"eval_loss": 3.4032058715820312,
"eval_runtime": 1.8747,
"eval_samples_per_second": 3731.788,
"eval_steps_per_second": 14.936,
"step": 42000
},
{
"combined_loss": 2.60400390625,
"distill_loss": 1.2034615278244019,
"epoch": 1.538574254524141,
"step": 42000,
"student_mlm_loss": 4.004546165466309
},
{
"epoch": 1.5422375265587223,
"grad_norm": 6.775146484375,
"learning_rate": 2.4445290807676243e-05,
"loss": 2.8405,
"step": 42100
},
{
"combined_loss": 1.7485601902008057,
"distill_loss": 1.1682909727096558,
"epoch": 1.5422375265587223,
"step": 42100,
"student_mlm_loss": 2.328829288482666
},
{
"epoch": 1.5459007985933035,
"grad_norm": 24.79000473022461,
"learning_rate": 2.4383861218271617e-05,
"loss": 2.9811,
"step": 42200
},
{
"combined_loss": 2.2294323444366455,
"distill_loss": 1.262848138809204,
"epoch": 1.5459007985933035,
"step": 42200,
"student_mlm_loss": 3.196016550064087
},
{
"epoch": 1.5495640706278848,
"grad_norm": 11.027627944946289,
"learning_rate": 2.4322431628866992e-05,
"loss": 3.7109,
"step": 42300
},
{
"combined_loss": 1.8129802942276,
"distill_loss": 1.205324411392212,
"epoch": 1.5495640706278848,
"step": 42300,
"student_mlm_loss": 2.4206361770629883
},
{
"epoch": 1.553227342662466,
"grad_norm": 6.328401565551758,
"learning_rate": 2.426100203946237e-05,
"loss": 31.168,
"step": 42400
},
{
"combined_loss": 2.391860246658325,
"distill_loss": 1.1356655359268188,
"epoch": 1.553227342662466,
"step": 42400,
"student_mlm_loss": 3.648054838180542
},
{
"epoch": 1.5568906146970474,
"grad_norm": 26.61184310913086,
"learning_rate": 2.4199572450057744e-05,
"loss": 6.4259,
"step": 42500
},
{
"combined_loss": 3.222200870513916,
"distill_loss": 1.3243845701217651,
"epoch": 1.5568906146970474,
"step": 42500,
"student_mlm_loss": 5.120017051696777
},
{
"epoch": 1.5605538867316286,
"grad_norm": 78.89910888671875,
"learning_rate": 2.4138142860653122e-05,
"loss": 3.3441,
"step": 42600
},
{
"combined_loss": 1.7442145347595215,
"distill_loss": 1.282542109489441,
"epoch": 1.5605538867316286,
"step": 42600,
"student_mlm_loss": 2.2058870792388916
},
{
"epoch": 1.56421715876621,
"grad_norm": 88.92566680908203,
"learning_rate": 2.4076713271248496e-05,
"loss": 2.8234,
"step": 42700
},
{
"combined_loss": 2.366835117340088,
"distill_loss": 1.1711124181747437,
"epoch": 1.56421715876621,
"step": 42700,
"student_mlm_loss": 3.5625579357147217
},
{
"epoch": 1.5678804308007912,
"grad_norm": 6.83758544921875,
"learning_rate": 2.4015283681843874e-05,
"loss": 5.4491,
"step": 42800
},
{
"combined_loss": 4.174956798553467,
"distill_loss": 1.0669249296188354,
"epoch": 1.5678804308007912,
"step": 42800,
"student_mlm_loss": 7.282988548278809
},
{
"epoch": 1.5715437028353727,
"grad_norm": 5.723924160003662,
"learning_rate": 2.395385409243925e-05,
"loss": 3.1108,
"step": 42900
},
{
"combined_loss": 2.3197238445281982,
"distill_loss": 1.2763570547103882,
"epoch": 1.5715437028353727,
"step": 42900,
"student_mlm_loss": 3.3630905151367188
},
{
"epoch": 1.5752069748699538,
"grad_norm": 14.807353973388672,
"learning_rate": 2.3892424503034623e-05,
"loss": 6.4113,
"step": 43000
},
{
"combined_loss": 1.7868092060089111,
"distill_loss": 1.1304634809494019,
"epoch": 1.5752069748699538,
"step": 43000,
"student_mlm_loss": 2.44315505027771
},
{
"epoch": 1.5788702469045353,
"grad_norm": 8.68276596069336,
"learning_rate": 2.3830994913629998e-05,
"loss": 5.1213,
"step": 43100
},
{
"combined_loss": 19.46100425720215,
"distill_loss": 1.259545087814331,
"epoch": 1.5788702469045353,
"step": 43100,
"student_mlm_loss": 37.6624641418457
},
{
"epoch": 1.5825335189391163,
"grad_norm": 4.91242790222168,
"learning_rate": 2.3769565324225372e-05,
"loss": 3.2674,
"step": 43200
},
{
"combined_loss": 1.797656536102295,
"distill_loss": 1.3039189577102661,
"epoch": 1.5825335189391163,
"step": 43200,
"student_mlm_loss": 2.2913942337036133
},
{
"epoch": 1.5861967909736978,
"grad_norm": 52.68294906616211,
"learning_rate": 2.370813573482075e-05,
"loss": 3.7711,
"step": 43300
},
{
"combined_loss": 1.8017528057098389,
"distill_loss": 1.1734706163406372,
"epoch": 1.5861967909736978,
"step": 43300,
"student_mlm_loss": 2.43003511428833
},
{
"epoch": 1.5898600630082789,
"grad_norm": 11.869544982910156,
"learning_rate": 2.3646706145416124e-05,
"loss": 9.8177,
"step": 43400
},
{
"combined_loss": 2.760119915008545,
"distill_loss": 1.2446471452713013,
"epoch": 1.5898600630082789,
"step": 43400,
"student_mlm_loss": 4.275592803955078
},
{
"epoch": 1.5935233350428604,
"grad_norm": 3.7819387912750244,
"learning_rate": 2.3585276556011502e-05,
"loss": 4.6552,
"step": 43500
},
{
"combined_loss": 4.660012245178223,
"distill_loss": 1.1187530755996704,
"epoch": 1.5935233350428604,
"step": 43500,
"student_mlm_loss": 8.201271057128906
},
{
"epoch": 1.5971866070774414,
"grad_norm": 21.269559860229492,
"learning_rate": 2.3523846966606877e-05,
"loss": 8.5404,
"step": 43600
},
{
"combined_loss": 2.3045759201049805,
"distill_loss": 1.3545589447021484,
"epoch": 1.5971866070774414,
"step": 43600,
"student_mlm_loss": 3.2545931339263916
},
{
"epoch": 1.600849879112023,
"grad_norm": 8.289508819580078,
"learning_rate": 2.3462417377202254e-05,
"loss": 2.7135,
"step": 43700
},
{
"combined_loss": 3.0867691040039062,
"distill_loss": 1.1124651432037354,
"epoch": 1.600849879112023,
"step": 43700,
"student_mlm_loss": 5.061073303222656
},
{
"epoch": 1.6045131511466042,
"grad_norm": 22.303661346435547,
"learning_rate": 2.3400987787797625e-05,
"loss": 3.6364,
"step": 43800
},
{
"combined_loss": 1.7930564880371094,
"distill_loss": 1.2114512920379639,
"epoch": 1.6045131511466042,
"step": 43800,
"student_mlm_loss": 2.374661684036255
},
{
"epoch": 1.6081764231811855,
"grad_norm": 4.351790904998779,
"learning_rate": 2.3339558198393003e-05,
"loss": 5.6887,
"step": 43900
},
{
"combined_loss": 1.7365663051605225,
"distill_loss": 1.2089755535125732,
"epoch": 1.6081764231811855,
"step": 43900,
"student_mlm_loss": 2.2641570568084717
},
{
"epoch": 1.6118396952157668,
"grad_norm": 13.450850486755371,
"learning_rate": 2.3278128608988378e-05,
"loss": 3.6702,
"step": 44000
},
{
"epoch": 1.6118396952157668,
"eval_loss": 3.194415330886841,
"eval_runtime": 1.9274,
"eval_samples_per_second": 3629.828,
"eval_steps_per_second": 14.528,
"step": 44000
},
{
"combined_loss": 1.760496735572815,
"distill_loss": 1.1514201164245605,
"epoch": 1.6118396952157668,
"step": 44000,
"student_mlm_loss": 2.3695733547210693
},
{
"epoch": 1.615502967250348,
"grad_norm": 7.381774425506592,
"learning_rate": 2.3216699019583756e-05,
"loss": 2.9269,
"step": 44100
},
{
"combined_loss": 4.663776397705078,
"distill_loss": 1.307958722114563,
"epoch": 1.615502967250348,
"step": 44100,
"student_mlm_loss": 8.019594192504883
},
{
"epoch": 1.6191662392849293,
"grad_norm": 10.999051094055176,
"learning_rate": 2.315526943017913e-05,
"loss": 3.0334,
"step": 44200
},
{
"combined_loss": 1.9191560745239258,
"distill_loss": 1.3481658697128296,
"epoch": 1.6191662392849293,
"step": 44200,
"student_mlm_loss": 2.4901461601257324
},
{
"epoch": 1.6228295113195106,
"grad_norm": 6.187446594238281,
"learning_rate": 2.3093839840774504e-05,
"loss": 30.6923,
"step": 44300
},
{
"combined_loss": 12.122703552246094,
"distill_loss": 1.1659897565841675,
"epoch": 1.6228295113195106,
"step": 44300,
"student_mlm_loss": 23.079418182373047
},
{
"epoch": 1.6264927833540919,
"grad_norm": 6.142828941345215,
"learning_rate": 2.3032410251369882e-05,
"loss": 7.4162,
"step": 44400
},
{
"combined_loss": 1.9456160068511963,
"distill_loss": 1.257858157157898,
"epoch": 1.6264927833540919,
"step": 44400,
"student_mlm_loss": 2.633373737335205
},
{
"epoch": 1.6301560553886731,
"grad_norm": 15.393942832946777,
"learning_rate": 2.2970980661965257e-05,
"loss": 4.8003,
"step": 44500
},
{
"combined_loss": 2.7578635215759277,
"distill_loss": 1.1640808582305908,
"epoch": 1.6301560553886731,
"step": 44500,
"student_mlm_loss": 4.351646423339844
},
{
"epoch": 1.6338193274232544,
"grad_norm": 18.73512077331543,
"learning_rate": 2.290955107256063e-05,
"loss": 5.3592,
"step": 44600
},
{
"combined_loss": 3.758654832839966,
"distill_loss": 1.260606288909912,
"epoch": 1.6338193274232544,
"step": 44600,
"student_mlm_loss": 6.2567033767700195
},
{
"epoch": 1.6374825994578357,
"grad_norm": 6.1570048332214355,
"learning_rate": 2.2848121483156006e-05,
"loss": 10.8594,
"step": 44700
},
{
"combined_loss": 3.205047845840454,
"distill_loss": 1.1495074033737183,
"epoch": 1.6374825994578357,
"step": 44700,
"student_mlm_loss": 5.2605881690979
},
{
"epoch": 1.641145871492417,
"grad_norm": 8.748614311218262,
"learning_rate": 2.2786691893751383e-05,
"loss": 2.611,
"step": 44800
},
{
"combined_loss": 2.7548794746398926,
"distill_loss": 1.153849482536316,
"epoch": 1.641145871492417,
"step": 44800,
"student_mlm_loss": 4.35590934753418
},
{
"epoch": 1.6448091435269983,
"grad_norm": 9.594339370727539,
"learning_rate": 2.2725262304346758e-05,
"loss": 3.621,
"step": 44900
},
{
"combined_loss": 2.63676381111145,
"distill_loss": 1.144437313079834,
"epoch": 1.6448091435269983,
"step": 44900,
"student_mlm_loss": 4.129090309143066
},
{
"epoch": 1.6484724155615798,
"grad_norm": 8.756010055541992,
"learning_rate": 2.2663832714942136e-05,
"loss": 5.0762,
"step": 45000
},
{
"combined_loss": 2.0047507286071777,
"distill_loss": 1.203262209892273,
"epoch": 1.6484724155615798,
"step": 45000,
"student_mlm_loss": 2.806239366531372
},
{
"epoch": 1.6521356875961608,
"grad_norm": 16.163911819458008,
"learning_rate": 2.260240312553751e-05,
"loss": 3.1675,
"step": 45100
},
{
"combined_loss": 1.822305679321289,
"distill_loss": 1.187317967414856,
"epoch": 1.6521356875961608,
"step": 45100,
"student_mlm_loss": 2.4572935104370117
},
{
"epoch": 1.6557989596307423,
"grad_norm": 4.047428607940674,
"learning_rate": 2.2540973536132888e-05,
"loss": 2.6406,
"step": 45200
},
{
"combined_loss": 2.431349039077759,
"distill_loss": 1.2643455266952515,
"epoch": 1.6557989596307423,
"step": 45200,
"student_mlm_loss": 3.5983526706695557
},
{
"epoch": 1.6594622316653234,
"grad_norm": 28.598485946655273,
"learning_rate": 2.247954394672826e-05,
"loss": 3.7667,
"step": 45300
},
{
"combined_loss": 2.274944543838501,
"distill_loss": 1.266087293624878,
"epoch": 1.6594622316653234,
"step": 45300,
"student_mlm_loss": 3.283801794052124
},
{
"epoch": 1.6631255036999049,
"grad_norm": 11.642946243286133,
"learning_rate": 2.2418114357323637e-05,
"loss": 3.0131,
"step": 45400
},
{
"combined_loss": 2.064805507659912,
"distill_loss": 1.2423893213272095,
"epoch": 1.6631255036999049,
"step": 45400,
"student_mlm_loss": 2.8872218132019043
},
{
"epoch": 1.666788775734486,
"grad_norm": 7.227854251861572,
"learning_rate": 2.235668476791901e-05,
"loss": 7.556,
"step": 45500
},
{
"combined_loss": 1.8626993894577026,
"distill_loss": 1.153686761856079,
"epoch": 1.666788775734486,
"step": 45500,
"student_mlm_loss": 2.571712017059326
},
{
"epoch": 1.6704520477690674,
"grad_norm": 11.972105026245117,
"learning_rate": 2.229525517851439e-05,
"loss": 3.9606,
"step": 45600
},
{
"combined_loss": 1.7529842853546143,
"distill_loss": 1.2637630701065063,
"epoch": 1.6704520477690674,
"step": 45600,
"student_mlm_loss": 2.2422056198120117
},
{
"epoch": 1.6741153198036485,
"grad_norm": 4.263253211975098,
"learning_rate": 2.2233825589109764e-05,
"loss": 3.0922,
"step": 45700
},
{
"combined_loss": 2.6089985370635986,
"distill_loss": 1.2136098146438599,
"epoch": 1.6741153198036485,
"step": 45700,
"student_mlm_loss": 4.004387378692627
},
{
"epoch": 1.67777859183823,
"grad_norm": 24.4074764251709,
"learning_rate": 2.2172395999705138e-05,
"loss": 3.2329,
"step": 45800
},
{
"combined_loss": 1.6919562816619873,
"distill_loss": 1.139168381690979,
"epoch": 1.67777859183823,
"step": 45800,
"student_mlm_loss": 2.244744300842285
},
{
"epoch": 1.6814418638728112,
"grad_norm": 5.1518778800964355,
"learning_rate": 2.2110966410300516e-05,
"loss": 9.4019,
"step": 45900
},
{
"combined_loss": 2.1822292804718018,
"distill_loss": 1.3423482179641724,
"epoch": 1.6814418638728112,
"step": 45900,
"student_mlm_loss": 3.0221104621887207
},
{
"epoch": 1.6851051359073925,
"grad_norm": 18.045368194580078,
"learning_rate": 2.204953682089589e-05,
"loss": 3.3662,
"step": 46000
},
{
"epoch": 1.6851051359073925,
"eval_loss": 3.070533275604248,
"eval_runtime": 1.9768,
"eval_samples_per_second": 3539.063,
"eval_steps_per_second": 14.164,
"step": 46000
},
{
"combined_loss": 1.8376495838165283,
"distill_loss": 1.261283278465271,
"epoch": 1.6851051359073925,
"step": 46000,
"student_mlm_loss": 2.414015769958496
},
{
"epoch": 1.6887684079419738,
"grad_norm": 5.69982385635376,
"learning_rate": 2.1988107231491265e-05,
"loss": 3.3451,
"step": 46100
},
{
"combined_loss": 1.7916219234466553,
"distill_loss": 1.2525031566619873,
"epoch": 1.6887684079419738,
"step": 46100,
"student_mlm_loss": 2.3307406902313232
},
{
"epoch": 1.692431679976555,
"grad_norm": 27.134151458740234,
"learning_rate": 2.192667764208664e-05,
"loss": 9.1006,
"step": 46200
},
{
"combined_loss": 59.0687141418457,
"distill_loss": 1.1848413944244385,
"epoch": 1.692431679976555,
"step": 46200,
"student_mlm_loss": 116.95258331298828
},
{
"epoch": 1.6960949520111364,
"grad_norm": 6.624229431152344,
"learning_rate": 2.1865248052682017e-05,
"loss": 3.0016,
"step": 46300
},
{
"combined_loss": 2.7997608184814453,
"distill_loss": 1.1524275541305542,
"epoch": 1.6960949520111364,
"step": 46300,
"student_mlm_loss": 4.447093963623047
},
{
"epoch": 1.6997582240457176,
"grad_norm": 5.472049236297607,
"learning_rate": 2.180381846327739e-05,
"loss": 20.0915,
"step": 46400
},
{
"combined_loss": 1.7153997421264648,
"distill_loss": 1.237658143043518,
"epoch": 1.6997582240457176,
"step": 46400,
"student_mlm_loss": 2.193141460418701
},
{
"epoch": 1.703421496080299,
"grad_norm": 14.290247917175293,
"learning_rate": 2.174238887387277e-05,
"loss": 4.5936,
"step": 46500
},
{
"combined_loss": 1.709627628326416,
"distill_loss": 1.2791212797164917,
"epoch": 1.703421496080299,
"step": 46500,
"student_mlm_loss": 2.140133857727051
},
{
"epoch": 1.7070847681148802,
"grad_norm": 17.962997436523438,
"learning_rate": 2.1680959284468144e-05,
"loss": 3.3627,
"step": 46600
},
{
"combined_loss": 7.8201751708984375,
"distill_loss": 1.3012824058532715,
"epoch": 1.7070847681148802,
"step": 46600,
"student_mlm_loss": 14.339067459106445
},
{
"epoch": 1.7107480401494615,
"grad_norm": 6.800339698791504,
"learning_rate": 2.161952969506352e-05,
"loss": 6.7955,
"step": 46700
},
{
"combined_loss": 1.809753656387329,
"distill_loss": 1.2891262769699097,
"epoch": 1.7107480401494615,
"step": 46700,
"student_mlm_loss": 2.330381155014038
},
{
"epoch": 1.7144113121840427,
"grad_norm": 12.281099319458008,
"learning_rate": 2.1558100105658896e-05,
"loss": 10.3436,
"step": 46800
},
{
"combined_loss": 3.3808600902557373,
"distill_loss": 1.2777303457260132,
"epoch": 1.7144113121840427,
"step": 46800,
"student_mlm_loss": 5.483989715576172
},
{
"epoch": 1.718074584218624,
"grad_norm": 3.3210408687591553,
"learning_rate": 2.149667051625427e-05,
"loss": 2.8055,
"step": 46900
},
{
"combined_loss": 2.1092348098754883,
"distill_loss": 1.2058593034744263,
"epoch": 1.718074584218624,
"step": 46900,
"student_mlm_loss": 3.0126101970672607
},
{
"epoch": 1.7217378562532053,
"grad_norm": 11.694738388061523,
"learning_rate": 2.1435240926849645e-05,
"loss": 4.6311,
"step": 47000
},
{
"combined_loss": 2.2222890853881836,
"distill_loss": 1.218597173690796,
"epoch": 1.7217378562532053,
"step": 47000,
"student_mlm_loss": 3.2259812355041504
},
{
"epoch": 1.7254011282877868,
"grad_norm": 23.036334991455078,
"learning_rate": 2.137381133744502e-05,
"loss": 2.5923,
"step": 47100
},
{
"combined_loss": 1.882810354232788,
"distill_loss": 1.2441027164459229,
"epoch": 1.7254011282877868,
"step": 47100,
"student_mlm_loss": 2.5215179920196533
},
{
"epoch": 1.7290644003223679,
"grad_norm": 65.06354522705078,
"learning_rate": 2.1312381748040397e-05,
"loss": 3.3375,
"step": 47200
},
{
"combined_loss": 1.84983229637146,
"distill_loss": 1.224557876586914,
"epoch": 1.7290644003223679,
"step": 47200,
"student_mlm_loss": 2.475106716156006
},
{
"epoch": 1.7327276723569494,
"grad_norm": 9.202945709228516,
"learning_rate": 2.1250952158635772e-05,
"loss": 3.0094,
"step": 47300
},
{
"combined_loss": 1.6417255401611328,
"distill_loss": 1.2296794652938843,
"epoch": 1.7327276723569494,
"step": 47300,
"student_mlm_loss": 2.053771734237671
},
{
"epoch": 1.7363909443915304,
"grad_norm": 7.1568193435668945,
"learning_rate": 2.118952256923115e-05,
"loss": 3.3413,
"step": 47400
},
{
"combined_loss": 2.165384531021118,
"distill_loss": 1.2572156190872192,
"epoch": 1.7363909443915304,
"step": 47400,
"student_mlm_loss": 3.0735535621643066
},
{
"epoch": 1.740054216426112,
"grad_norm": 39.054439544677734,
"learning_rate": 2.1128092979826524e-05,
"loss": 4.8522,
"step": 47500
},
{
"combined_loss": 2.6122236251831055,
"distill_loss": 1.1487023830413818,
"epoch": 1.740054216426112,
"step": 47500,
"student_mlm_loss": 4.07574462890625
},
{
"epoch": 1.743717488460693,
"grad_norm": 3.18758487701416,
"learning_rate": 2.1066663390421902e-05,
"loss": 4.3993,
"step": 47600
},
{
"combined_loss": 6.344114303588867,
"distill_loss": 1.1341725587844849,
"epoch": 1.743717488460693,
"step": 47600,
"student_mlm_loss": 11.554056167602539
},
{
"epoch": 1.7473807604952745,
"grad_norm": 9.418896675109863,
"learning_rate": 2.1005233801017273e-05,
"loss": 8.7279,
"step": 47700
},
{
"combined_loss": 2.8721518516540527,
"distill_loss": 1.2175838947296143,
"epoch": 1.7473807604952745,
"step": 47700,
"student_mlm_loss": 4.526719570159912
},
{
"epoch": 1.7510440325298555,
"grad_norm": 4.730939865112305,
"learning_rate": 2.094380421161265e-05,
"loss": 2.74,
"step": 47800
},
{
"combined_loss": 1.8483730554580688,
"distill_loss": 1.2789607048034668,
"epoch": 1.7510440325298555,
"step": 47800,
"student_mlm_loss": 2.417785406112671
},
{
"epoch": 1.754707304564437,
"grad_norm": 4.566458225250244,
"learning_rate": 2.0882374622208025e-05,
"loss": 2.63,
"step": 47900
},
{
"combined_loss": 1.8073049783706665,
"distill_loss": 1.3073413372039795,
"epoch": 1.754707304564437,
"step": 47900,
"student_mlm_loss": 2.3072686195373535
},
{
"epoch": 1.7583705765990183,
"grad_norm": 14.967068672180176,
"learning_rate": 2.0820945032803403e-05,
"loss": 2.5821,
"step": 48000
},
{
"epoch": 1.7583705765990183,
"eval_loss": 3.2400870323181152,
"eval_runtime": 1.8322,
"eval_samples_per_second": 3818.29,
"eval_steps_per_second": 15.282,
"step": 48000
}
],
"logging_steps": 100,
"max_steps": 81894,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7150683130961408e+16,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}