|
{ |
|
"best_metric": 3.070533275604248, |
|
"best_model_checkpoint": "./distilled3/checkpoint-46000", |
|
"epoch": 1.7583705765990183, |
|
"eval_steps": 2000, |
|
"global_step": 48000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"combined_loss": 13.355602264404297, |
|
"distill_loss": 1.4010732173919678, |
|
"epoch": 0, |
|
"step": 0, |
|
"student_mlm_loss": 25.310131072998047 |
|
}, |
|
{ |
|
"epoch": 0.003663272034581288, |
|
"grad_norm": 11.128765106201172, |
|
"learning_rate": 1e-05, |
|
"loss": 17.4544, |
|
"step": 100 |
|
}, |
|
{ |
|
"combined_loss": 9.379831314086914, |
|
"distill_loss": 1.5227235555648804, |
|
"epoch": 0.003663272034581288, |
|
"step": 100, |
|
"student_mlm_loss": 17.2369384765625 |
|
}, |
|
{ |
|
"epoch": 0.007326544069162576, |
|
"grad_norm": 14.151921272277832, |
|
"learning_rate": 2e-05, |
|
"loss": 16.0099, |
|
"step": 200 |
|
}, |
|
{ |
|
"combined_loss": 28.136512756347656, |
|
"distill_loss": 1.571045160293579, |
|
"epoch": 0.007326544069162576, |
|
"step": 200, |
|
"student_mlm_loss": 54.70198059082031 |
|
}, |
|
{ |
|
"epoch": 0.010989816103743864, |
|
"grad_norm": 11.68195915222168, |
|
"learning_rate": 3e-05, |
|
"loss": 18.8223, |
|
"step": 300 |
|
}, |
|
{ |
|
"combined_loss": 15.699158668518066, |
|
"distill_loss": 1.5519400835037231, |
|
"epoch": 0.010989816103743864, |
|
"step": 300, |
|
"student_mlm_loss": 29.846376419067383 |
|
}, |
|
{ |
|
"epoch": 0.014653088138325152, |
|
"grad_norm": 8.982569694519043, |
|
"learning_rate": 4e-05, |
|
"loss": 16.9008, |
|
"step": 400 |
|
}, |
|
{ |
|
"combined_loss": 3.035900592803955, |
|
"distill_loss": 1.4880340099334717, |
|
"epoch": 0.014653088138325152, |
|
"step": 400, |
|
"student_mlm_loss": 4.583766937255859 |
|
}, |
|
{ |
|
"epoch": 0.01831636017290644, |
|
"grad_norm": 7.045658111572266, |
|
"learning_rate": 5e-05, |
|
"loss": 8.812, |
|
"step": 500 |
|
}, |
|
{ |
|
"combined_loss": 7.002770900726318, |
|
"distill_loss": 1.351847529411316, |
|
"epoch": 0.01831636017290644, |
|
"step": 500, |
|
"student_mlm_loss": 12.653694152832031 |
|
}, |
|
{ |
|
"epoch": 0.021979632207487727, |
|
"grad_norm": 4.265043258666992, |
|
"learning_rate": 4.9938570410595373e-05, |
|
"loss": 16.8853, |
|
"step": 600 |
|
}, |
|
{ |
|
"combined_loss": 3.2060928344726562, |
|
"distill_loss": 1.2962806224822998, |
|
"epoch": 0.021979632207487727, |
|
"step": 600, |
|
"student_mlm_loss": 5.115904808044434 |
|
}, |
|
{ |
|
"epoch": 0.025642904242069015, |
|
"grad_norm": 7.744924545288086, |
|
"learning_rate": 4.987714082119075e-05, |
|
"loss": 7.1609, |
|
"step": 700 |
|
}, |
|
{ |
|
"combined_loss": 2.2816712856292725, |
|
"distill_loss": 1.5105196237564087, |
|
"epoch": 0.025642904242069015, |
|
"step": 700, |
|
"student_mlm_loss": 3.052823066711426 |
|
}, |
|
{ |
|
"epoch": 0.029306176276650303, |
|
"grad_norm": 12.44052791595459, |
|
"learning_rate": 4.981571123178613e-05, |
|
"loss": 13.0471, |
|
"step": 800 |
|
}, |
|
{ |
|
"combined_loss": 3.225351095199585, |
|
"distill_loss": 1.5753816366195679, |
|
"epoch": 0.029306176276650303, |
|
"step": 800, |
|
"student_mlm_loss": 4.8753204345703125 |
|
}, |
|
{ |
|
"epoch": 0.032969448311231594, |
|
"grad_norm": 6.2059645652771, |
|
"learning_rate": 4.975428164238151e-05, |
|
"loss": 6.2833, |
|
"step": 900 |
|
}, |
|
{ |
|
"combined_loss": 8.580605506896973, |
|
"distill_loss": 1.530474066734314, |
|
"epoch": 0.032969448311231594, |
|
"step": 900, |
|
"student_mlm_loss": 15.630736351013184 |
|
}, |
|
{ |
|
"epoch": 0.03663272034581288, |
|
"grad_norm": 14.731459617614746, |
|
"learning_rate": 4.969285205297688e-05, |
|
"loss": 5.8549, |
|
"step": 1000 |
|
}, |
|
{ |
|
"combined_loss": 3.7085845470428467, |
|
"distill_loss": 1.4659323692321777, |
|
"epoch": 0.03663272034581288, |
|
"step": 1000, |
|
"student_mlm_loss": 5.951236724853516 |
|
}, |
|
{ |
|
"epoch": 0.04029599238039417, |
|
"grad_norm": 9.745060920715332, |
|
"learning_rate": 4.9631422463572256e-05, |
|
"loss": 5.174, |
|
"step": 1100 |
|
}, |
|
{ |
|
"combined_loss": 4.752764701843262, |
|
"distill_loss": 1.4000483751296997, |
|
"epoch": 0.04029599238039417, |
|
"step": 1100, |
|
"student_mlm_loss": 8.105481147766113 |
|
}, |
|
{ |
|
"epoch": 0.043959264414975455, |
|
"grad_norm": 13.801424026489258, |
|
"learning_rate": 4.9569992874167634e-05, |
|
"loss": 19.8368, |
|
"step": 1200 |
|
}, |
|
{ |
|
"combined_loss": 3.1324005126953125, |
|
"distill_loss": 1.404078483581543, |
|
"epoch": 0.043959264414975455, |
|
"step": 1200, |
|
"student_mlm_loss": 4.860722541809082 |
|
}, |
|
{ |
|
"epoch": 0.047622536449556746, |
|
"grad_norm": 52.244632720947266, |
|
"learning_rate": 4.9508563284763005e-05, |
|
"loss": 5.547, |
|
"step": 1300 |
|
}, |
|
{ |
|
"combined_loss": 3.1176328659057617, |
|
"distill_loss": 1.3057805299758911, |
|
"epoch": 0.047622536449556746, |
|
"step": 1300, |
|
"student_mlm_loss": 4.929485321044922 |
|
}, |
|
{ |
|
"epoch": 0.05128580848413803, |
|
"grad_norm": 47.002349853515625, |
|
"learning_rate": 4.944713369535838e-05, |
|
"loss": 4.7784, |
|
"step": 1400 |
|
}, |
|
{ |
|
"combined_loss": 3.871903657913208, |
|
"distill_loss": 1.5537463426589966, |
|
"epoch": 0.05128580848413803, |
|
"step": 1400, |
|
"student_mlm_loss": 6.190061092376709 |
|
}, |
|
{ |
|
"epoch": 0.05494908051871932, |
|
"grad_norm": 11.417911529541016, |
|
"learning_rate": 4.9385704105953754e-05, |
|
"loss": 5.9593, |
|
"step": 1500 |
|
}, |
|
{ |
|
"combined_loss": 6.293668270111084, |
|
"distill_loss": 1.3082151412963867, |
|
"epoch": 0.05494908051871932, |
|
"step": 1500, |
|
"student_mlm_loss": 11.279121398925781 |
|
}, |
|
{ |
|
"epoch": 0.058612352553300606, |
|
"grad_norm": 24.519105911254883, |
|
"learning_rate": 4.932427451654914e-05, |
|
"loss": 7.2762, |
|
"step": 1600 |
|
}, |
|
{ |
|
"combined_loss": 3.350501775741577, |
|
"distill_loss": 1.4593900442123413, |
|
"epoch": 0.058612352553300606, |
|
"step": 1600, |
|
"student_mlm_loss": 5.241613388061523 |
|
}, |
|
{ |
|
"epoch": 0.0622756245878819, |
|
"grad_norm": 42.58499526977539, |
|
"learning_rate": 4.926284492714451e-05, |
|
"loss": 7.1364, |
|
"step": 1700 |
|
}, |
|
{ |
|
"combined_loss": 10.976073265075684, |
|
"distill_loss": 1.594639539718628, |
|
"epoch": 0.0622756245878819, |
|
"step": 1700, |
|
"student_mlm_loss": 20.357507705688477 |
|
}, |
|
{ |
|
"epoch": 0.06593889662246319, |
|
"grad_norm": 105.27689361572266, |
|
"learning_rate": 4.920141533773989e-05, |
|
"loss": 5.7662, |
|
"step": 1800 |
|
}, |
|
{ |
|
"combined_loss": 4.272126197814941, |
|
"distill_loss": 1.4649100303649902, |
|
"epoch": 0.06593889662246319, |
|
"step": 1800, |
|
"student_mlm_loss": 7.079341888427734 |
|
}, |
|
{ |
|
"epoch": 0.06960216865704447, |
|
"grad_norm": 9.272991180419922, |
|
"learning_rate": 4.913998574833526e-05, |
|
"loss": 4.9898, |
|
"step": 1900 |
|
}, |
|
{ |
|
"combined_loss": 2.2884514331817627, |
|
"distill_loss": 1.5105092525482178, |
|
"epoch": 0.06960216865704447, |
|
"step": 1900, |
|
"student_mlm_loss": 3.0663936138153076 |
|
}, |
|
{ |
|
"epoch": 0.07326544069162576, |
|
"grad_norm": 15.299578666687012, |
|
"learning_rate": 4.9078556158930636e-05, |
|
"loss": 6.8909, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.07326544069162576, |
|
"eval_loss": 6.166979789733887, |
|
"eval_runtime": 2.1158, |
|
"eval_samples_per_second": 3306.616, |
|
"eval_steps_per_second": 13.234, |
|
"step": 2000 |
|
}, |
|
{ |
|
"combined_loss": 5.612101078033447, |
|
"distill_loss": 1.332657814025879, |
|
"epoch": 0.07326544069162576, |
|
"step": 2000, |
|
"student_mlm_loss": 9.891544342041016 |
|
}, |
|
{ |
|
"epoch": 0.07692871272620705, |
|
"grad_norm": 12.242279052734375, |
|
"learning_rate": 4.9017126569526014e-05, |
|
"loss": 8.6608, |
|
"step": 2100 |
|
}, |
|
{ |
|
"combined_loss": 2.035828113555908, |
|
"distill_loss": 1.3731106519699097, |
|
"epoch": 0.07692871272620705, |
|
"step": 2100, |
|
"student_mlm_loss": 2.6985456943511963 |
|
}, |
|
{ |
|
"epoch": 0.08059198476078834, |
|
"grad_norm": 27.212379455566406, |
|
"learning_rate": 4.8955696980121385e-05, |
|
"loss": 9.4649, |
|
"step": 2200 |
|
}, |
|
{ |
|
"combined_loss": 2.5593996047973633, |
|
"distill_loss": 1.5456775426864624, |
|
"epoch": 0.08059198476078834, |
|
"step": 2200, |
|
"student_mlm_loss": 3.5731217861175537 |
|
}, |
|
{ |
|
"epoch": 0.08425525679536962, |
|
"grad_norm": 9.444129943847656, |
|
"learning_rate": 4.889426739071676e-05, |
|
"loss": 12.6304, |
|
"step": 2300 |
|
}, |
|
{ |
|
"combined_loss": 3.0112435817718506, |
|
"distill_loss": 1.268593192100525, |
|
"epoch": 0.08425525679536962, |
|
"step": 2300, |
|
"student_mlm_loss": 4.753893852233887 |
|
}, |
|
{ |
|
"epoch": 0.08791852882995091, |
|
"grad_norm": 6.72172212600708, |
|
"learning_rate": 4.8832837801312134e-05, |
|
"loss": 4.2453, |
|
"step": 2400 |
|
}, |
|
{ |
|
"combined_loss": 2.3823843002319336, |
|
"distill_loss": 1.3674836158752441, |
|
"epoch": 0.08791852882995091, |
|
"step": 2400, |
|
"student_mlm_loss": 3.397284984588623 |
|
}, |
|
{ |
|
"epoch": 0.0915818008645322, |
|
"grad_norm": 88.5478744506836, |
|
"learning_rate": 4.877140821190752e-05, |
|
"loss": 4.6849, |
|
"step": 2500 |
|
}, |
|
{ |
|
"combined_loss": 3.8919034004211426, |
|
"distill_loss": 1.523806095123291, |
|
"epoch": 0.0915818008645322, |
|
"step": 2500, |
|
"student_mlm_loss": 6.260000705718994 |
|
}, |
|
{ |
|
"epoch": 0.09524507289911349, |
|
"grad_norm": 11.671692848205566, |
|
"learning_rate": 4.870997862250289e-05, |
|
"loss": 4.8686, |
|
"step": 2600 |
|
}, |
|
{ |
|
"combined_loss": 2.8186635971069336, |
|
"distill_loss": 1.313085913658142, |
|
"epoch": 0.09524507289911349, |
|
"step": 2600, |
|
"student_mlm_loss": 4.3242411613464355 |
|
}, |
|
{ |
|
"epoch": 0.09890834493369477, |
|
"grad_norm": 7.681136131286621, |
|
"learning_rate": 4.864854903309827e-05, |
|
"loss": 14.7468, |
|
"step": 2700 |
|
}, |
|
{ |
|
"combined_loss": 2.6350021362304688, |
|
"distill_loss": 1.5300695896148682, |
|
"epoch": 0.09890834493369477, |
|
"step": 2700, |
|
"student_mlm_loss": 3.7399346828460693 |
|
}, |
|
{ |
|
"epoch": 0.10257161696827606, |
|
"grad_norm": 10.245522499084473, |
|
"learning_rate": 4.858711944369364e-05, |
|
"loss": 4.7465, |
|
"step": 2800 |
|
}, |
|
{ |
|
"combined_loss": 1.9805179834365845, |
|
"distill_loss": 1.3671844005584717, |
|
"epoch": 0.10257161696827606, |
|
"step": 2800, |
|
"student_mlm_loss": 2.5938515663146973 |
|
}, |
|
{ |
|
"epoch": 0.10623488900285735, |
|
"grad_norm": 51.705352783203125, |
|
"learning_rate": 4.8525689854289016e-05, |
|
"loss": 3.8985, |
|
"step": 2900 |
|
}, |
|
{ |
|
"combined_loss": 1.9335501194000244, |
|
"distill_loss": 1.3294615745544434, |
|
"epoch": 0.10623488900285735, |
|
"step": 2900, |
|
"student_mlm_loss": 2.5376386642456055 |
|
}, |
|
{ |
|
"epoch": 0.10989816103743864, |
|
"grad_norm": 7.661074161529541, |
|
"learning_rate": 4.8464260264884394e-05, |
|
"loss": 3.9846, |
|
"step": 3000 |
|
}, |
|
{ |
|
"combined_loss": 2.815329074859619, |
|
"distill_loss": 1.5120948553085327, |
|
"epoch": 0.10989816103743864, |
|
"step": 3000, |
|
"student_mlm_loss": 4.118563175201416 |
|
}, |
|
{ |
|
"epoch": 0.11356143307201993, |
|
"grad_norm": 3.9512596130371094, |
|
"learning_rate": 4.8402830675479765e-05, |
|
"loss": 5.6509, |
|
"step": 3100 |
|
}, |
|
{ |
|
"combined_loss": 5.329846382141113, |
|
"distill_loss": 1.5839005708694458, |
|
"epoch": 0.11356143307201993, |
|
"step": 3100, |
|
"student_mlm_loss": 9.07579231262207 |
|
}, |
|
{ |
|
"epoch": 0.11722470510660121, |
|
"grad_norm": 21.47922134399414, |
|
"learning_rate": 4.834140108607514e-05, |
|
"loss": 4.5437, |
|
"step": 3200 |
|
}, |
|
{ |
|
"combined_loss": 3.32517147064209, |
|
"distill_loss": 1.4834882020950317, |
|
"epoch": 0.11722470510660121, |
|
"step": 3200, |
|
"student_mlm_loss": 5.1668548583984375 |
|
}, |
|
{ |
|
"epoch": 0.1208879771411825, |
|
"grad_norm": 11.865033149719238, |
|
"learning_rate": 4.827997149667052e-05, |
|
"loss": 5.0218, |
|
"step": 3300 |
|
}, |
|
{ |
|
"combined_loss": 2.84318208694458, |
|
"distill_loss": 1.302217960357666, |
|
"epoch": 0.1208879771411825, |
|
"step": 3300, |
|
"student_mlm_loss": 4.384146213531494 |
|
}, |
|
{ |
|
"epoch": 0.1245512491757638, |
|
"grad_norm": 13.824487686157227, |
|
"learning_rate": 4.82185419072659e-05, |
|
"loss": 33.2949, |
|
"step": 3400 |
|
}, |
|
{ |
|
"combined_loss": 2.065192937850952, |
|
"distill_loss": 1.3474924564361572, |
|
"epoch": 0.1245512491757638, |
|
"step": 3400, |
|
"student_mlm_loss": 2.782893419265747 |
|
}, |
|
{ |
|
"epoch": 0.12821452121034507, |
|
"grad_norm": 34.21382522583008, |
|
"learning_rate": 4.815711231786127e-05, |
|
"loss": 12.5775, |
|
"step": 3500 |
|
}, |
|
{ |
|
"combined_loss": 2.2148988246917725, |
|
"distill_loss": 1.616875171661377, |
|
"epoch": 0.12821452121034507, |
|
"step": 3500, |
|
"student_mlm_loss": 2.812922477722168 |
|
}, |
|
{ |
|
"epoch": 0.13187779324492638, |
|
"grad_norm": 8.859841346740723, |
|
"learning_rate": 4.809568272845665e-05, |
|
"loss": 4.6975, |
|
"step": 3600 |
|
}, |
|
{ |
|
"combined_loss": 4.478976726531982, |
|
"distill_loss": 1.3554083108901978, |
|
"epoch": 0.13187779324492638, |
|
"step": 3600, |
|
"student_mlm_loss": 7.602544784545898 |
|
}, |
|
{ |
|
"epoch": 0.13554106527950766, |
|
"grad_norm": 12.680179595947266, |
|
"learning_rate": 4.803425313905202e-05, |
|
"loss": 4.5414, |
|
"step": 3700 |
|
}, |
|
{ |
|
"combined_loss": 6.908867835998535, |
|
"distill_loss": 1.3570021390914917, |
|
"epoch": 0.13554106527950766, |
|
"step": 3700, |
|
"student_mlm_loss": 12.460733413696289 |
|
}, |
|
{ |
|
"epoch": 0.13920433731408893, |
|
"grad_norm": 18.478200912475586, |
|
"learning_rate": 4.7972823549647396e-05, |
|
"loss": 35.1443, |
|
"step": 3800 |
|
}, |
|
{ |
|
"combined_loss": 13.97608757019043, |
|
"distill_loss": 1.418832778930664, |
|
"epoch": 0.13920433731408893, |
|
"step": 3800, |
|
"student_mlm_loss": 26.533342361450195 |
|
}, |
|
{ |
|
"epoch": 0.14286760934867024, |
|
"grad_norm": 10.53610897064209, |
|
"learning_rate": 4.7911393960242774e-05, |
|
"loss": 13.766, |
|
"step": 3900 |
|
}, |
|
{ |
|
"combined_loss": 2.1997413635253906, |
|
"distill_loss": 1.4529953002929688, |
|
"epoch": 0.14286760934867024, |
|
"step": 3900, |
|
"student_mlm_loss": 2.9464874267578125 |
|
}, |
|
{ |
|
"epoch": 0.14653088138325152, |
|
"grad_norm": 42.095558166503906, |
|
"learning_rate": 4.7849964370838145e-05, |
|
"loss": 3.297, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.14653088138325152, |
|
"eval_loss": 4.568027496337891, |
|
"eval_runtime": 2.0693, |
|
"eval_samples_per_second": 3380.818, |
|
"eval_steps_per_second": 13.531, |
|
"step": 4000 |
|
}, |
|
{ |
|
"combined_loss": 2.278163433074951, |
|
"distill_loss": 1.5395259857177734, |
|
"epoch": 0.14653088138325152, |
|
"step": 4000, |
|
"student_mlm_loss": 3.016800880432129 |
|
}, |
|
{ |
|
"epoch": 0.15019415341783282, |
|
"grad_norm": 15.655592918395996, |
|
"learning_rate": 4.778853478143352e-05, |
|
"loss": 4.5795, |
|
"step": 4100 |
|
}, |
|
{ |
|
"combined_loss": 2.117962598800659, |
|
"distill_loss": 1.5073814392089844, |
|
"epoch": 0.15019415341783282, |
|
"step": 4100, |
|
"student_mlm_loss": 2.728543758392334 |
|
}, |
|
{ |
|
"epoch": 0.1538574254524141, |
|
"grad_norm": 9.47999382019043, |
|
"learning_rate": 4.77271051920289e-05, |
|
"loss": 4.6384, |
|
"step": 4200 |
|
}, |
|
{ |
|
"combined_loss": 2.2614216804504395, |
|
"distill_loss": 1.3999947309494019, |
|
"epoch": 0.1538574254524141, |
|
"step": 4200, |
|
"student_mlm_loss": 3.1228485107421875 |
|
}, |
|
{ |
|
"epoch": 0.15752069748699538, |
|
"grad_norm": 12.137129783630371, |
|
"learning_rate": 4.766567560262428e-05, |
|
"loss": 3.6101, |
|
"step": 4300 |
|
}, |
|
{ |
|
"combined_loss": 1.9776763916015625, |
|
"distill_loss": 1.4785245656967163, |
|
"epoch": 0.15752069748699538, |
|
"step": 4300, |
|
"student_mlm_loss": 2.476828098297119 |
|
}, |
|
{ |
|
"epoch": 0.16118396952157668, |
|
"grad_norm": 74.8094253540039, |
|
"learning_rate": 4.760424601321965e-05, |
|
"loss": 4.9111, |
|
"step": 4400 |
|
}, |
|
{ |
|
"combined_loss": 3.0158274173736572, |
|
"distill_loss": 1.2940564155578613, |
|
"epoch": 0.16118396952157668, |
|
"step": 4400, |
|
"student_mlm_loss": 4.737598419189453 |
|
}, |
|
{ |
|
"epoch": 0.16484724155615796, |
|
"grad_norm": 5.339694499969482, |
|
"learning_rate": 4.754281642381502e-05, |
|
"loss": 3.4013, |
|
"step": 4500 |
|
}, |
|
{ |
|
"combined_loss": 2.176065683364868, |
|
"distill_loss": 1.5688632726669312, |
|
"epoch": 0.16484724155615796, |
|
"step": 4500, |
|
"student_mlm_loss": 2.7832682132720947 |
|
}, |
|
{ |
|
"epoch": 0.16851051359073924, |
|
"grad_norm": 12.745500564575195, |
|
"learning_rate": 4.74813868344104e-05, |
|
"loss": 3.1244, |
|
"step": 4600 |
|
}, |
|
{ |
|
"combined_loss": 2.4230682849884033, |
|
"distill_loss": 1.46636962890625, |
|
"epoch": 0.16851051359073924, |
|
"step": 4600, |
|
"student_mlm_loss": 3.3797669410705566 |
|
}, |
|
{ |
|
"epoch": 0.17217378562532054, |
|
"grad_norm": 14.515507698059082, |
|
"learning_rate": 4.7419957245005777e-05, |
|
"loss": 4.9862, |
|
"step": 4700 |
|
}, |
|
{ |
|
"combined_loss": 6.772428512573242, |
|
"distill_loss": 1.6445391178131104, |
|
"epoch": 0.17217378562532054, |
|
"step": 4700, |
|
"student_mlm_loss": 11.900318145751953 |
|
}, |
|
{ |
|
"epoch": 0.17583705765990182, |
|
"grad_norm": 10.036664962768555, |
|
"learning_rate": 4.7358527655601154e-05, |
|
"loss": 3.72, |
|
"step": 4800 |
|
}, |
|
{ |
|
"combined_loss": 27.606048583984375, |
|
"distill_loss": 1.4302338361740112, |
|
"epoch": 0.17583705765990182, |
|
"step": 4800, |
|
"student_mlm_loss": 53.781864166259766 |
|
}, |
|
{ |
|
"epoch": 0.17950032969448312, |
|
"grad_norm": 14.220582008361816, |
|
"learning_rate": 4.7297098066196525e-05, |
|
"loss": 9.0684, |
|
"step": 4900 |
|
}, |
|
{ |
|
"combined_loss": 7.97739839553833, |
|
"distill_loss": 1.4764257669448853, |
|
"epoch": 0.17950032969448312, |
|
"step": 4900, |
|
"student_mlm_loss": 14.478370666503906 |
|
}, |
|
{ |
|
"epoch": 0.1831636017290644, |
|
"grad_norm": 8.734748840332031, |
|
"learning_rate": 4.72356684767919e-05, |
|
"loss": 13.2974, |
|
"step": 5000 |
|
}, |
|
{ |
|
"combined_loss": 3.3007736206054688, |
|
"distill_loss": 1.5111989974975586, |
|
"epoch": 0.1831636017290644, |
|
"step": 5000, |
|
"student_mlm_loss": 5.090348243713379 |
|
}, |
|
{ |
|
"epoch": 0.18682687376364568, |
|
"grad_norm": 23.457653045654297, |
|
"learning_rate": 4.717423888738728e-05, |
|
"loss": 4.4811, |
|
"step": 5100 |
|
}, |
|
{ |
|
"combined_loss": 2.695789337158203, |
|
"distill_loss": 1.4495799541473389, |
|
"epoch": 0.18682687376364568, |
|
"step": 5100, |
|
"student_mlm_loss": 3.9419989585876465 |
|
}, |
|
{ |
|
"epoch": 0.19049014579822698, |
|
"grad_norm": 11.504470825195312, |
|
"learning_rate": 4.711280929798265e-05, |
|
"loss": 3.2576, |
|
"step": 5200 |
|
}, |
|
{ |
|
"combined_loss": 3.5765743255615234, |
|
"distill_loss": 1.3500127792358398, |
|
"epoch": 0.19049014579822698, |
|
"step": 5200, |
|
"student_mlm_loss": 5.803135871887207 |
|
}, |
|
{ |
|
"epoch": 0.19415341783280826, |
|
"grad_norm": 34.68207550048828, |
|
"learning_rate": 4.705137970857803e-05, |
|
"loss": 5.8403, |
|
"step": 5300 |
|
}, |
|
{ |
|
"combined_loss": 4.304483413696289, |
|
"distill_loss": 1.4075747728347778, |
|
"epoch": 0.19415341783280826, |
|
"step": 5300, |
|
"student_mlm_loss": 7.20139217376709 |
|
}, |
|
{ |
|
"epoch": 0.19781668986738954, |
|
"grad_norm": 22.416582107543945, |
|
"learning_rate": 4.69899501191734e-05, |
|
"loss": 4.045, |
|
"step": 5400 |
|
}, |
|
{ |
|
"combined_loss": 1.9111289978027344, |
|
"distill_loss": 1.321276307106018, |
|
"epoch": 0.19781668986738954, |
|
"step": 5400, |
|
"student_mlm_loss": 2.500981569290161 |
|
}, |
|
{ |
|
"epoch": 0.20147996190197084, |
|
"grad_norm": 27.66775894165039, |
|
"learning_rate": 4.6928520529768786e-05, |
|
"loss": 3.8896, |
|
"step": 5500 |
|
}, |
|
{ |
|
"combined_loss": 2.142390251159668, |
|
"distill_loss": 1.4025957584381104, |
|
"epoch": 0.20147996190197084, |
|
"step": 5500, |
|
"student_mlm_loss": 2.8821845054626465 |
|
}, |
|
{ |
|
"epoch": 0.20514323393655212, |
|
"grad_norm": 35.84339141845703, |
|
"learning_rate": 4.686709094036416e-05, |
|
"loss": 4.94, |
|
"step": 5600 |
|
}, |
|
{ |
|
"combined_loss": 2.1642816066741943, |
|
"distill_loss": 1.392912745475769, |
|
"epoch": 0.20514323393655212, |
|
"step": 5600, |
|
"student_mlm_loss": 2.935650587081909 |
|
}, |
|
{ |
|
"epoch": 0.20880650597113343, |
|
"grad_norm": 18.43452262878418, |
|
"learning_rate": 4.6805661350959535e-05, |
|
"loss": 7.4575, |
|
"step": 5700 |
|
}, |
|
{ |
|
"combined_loss": 2.354356288909912, |
|
"distill_loss": 1.3411612510681152, |
|
"epoch": 0.20880650597113343, |
|
"step": 5700, |
|
"student_mlm_loss": 3.36755108833313 |
|
}, |
|
{ |
|
"epoch": 0.2124697780057147, |
|
"grad_norm": 5.364467144012451, |
|
"learning_rate": 4.6744231761554906e-05, |
|
"loss": 3.2172, |
|
"step": 5800 |
|
}, |
|
{ |
|
"combined_loss": 2.129748821258545, |
|
"distill_loss": 1.4555408954620361, |
|
"epoch": 0.2124697780057147, |
|
"step": 5800, |
|
"student_mlm_loss": 2.8039567470550537 |
|
}, |
|
{ |
|
"epoch": 0.21613305004029598, |
|
"grad_norm": 12.704414367675781, |
|
"learning_rate": 4.6682802172150283e-05, |
|
"loss": 9.9214, |
|
"step": 5900 |
|
}, |
|
{ |
|
"combined_loss": 5.396609783172607, |
|
"distill_loss": 1.3954136371612549, |
|
"epoch": 0.21613305004029598, |
|
"step": 5900, |
|
"student_mlm_loss": 9.397806167602539 |
|
}, |
|
{ |
|
"epoch": 0.2197963220748773, |
|
"grad_norm": 9.411243438720703, |
|
"learning_rate": 4.662137258274566e-05, |
|
"loss": 4.6268, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.2197963220748773, |
|
"eval_loss": 4.474331855773926, |
|
"eval_runtime": 2.0765, |
|
"eval_samples_per_second": 3369.116, |
|
"eval_steps_per_second": 13.484, |
|
"step": 6000 |
|
}, |
|
{ |
|
"combined_loss": 2.3863794803619385, |
|
"distill_loss": 1.4665789604187012, |
|
"epoch": 0.2197963220748773, |
|
"step": 6000, |
|
"student_mlm_loss": 3.306180000305176 |
|
}, |
|
{ |
|
"epoch": 0.22345959410945856, |
|
"grad_norm": 15.34604263305664, |
|
"learning_rate": 4.655994299334103e-05, |
|
"loss": 3.586, |
|
"step": 6100 |
|
}, |
|
{ |
|
"combined_loss": 2.5740702152252197, |
|
"distill_loss": 1.5186127424240112, |
|
"epoch": 0.22345959410945856, |
|
"step": 6100, |
|
"student_mlm_loss": 3.6295275688171387 |
|
}, |
|
{ |
|
"epoch": 0.22712286614403987, |
|
"grad_norm": 10.821826934814453, |
|
"learning_rate": 4.649851340393641e-05, |
|
"loss": 5.516, |
|
"step": 6200 |
|
}, |
|
{ |
|
"combined_loss": 4.770940780639648, |
|
"distill_loss": 1.5328683853149414, |
|
"epoch": 0.22712286614403987, |
|
"step": 6200, |
|
"student_mlm_loss": 8.009013175964355 |
|
}, |
|
{ |
|
"epoch": 0.23078613817862115, |
|
"grad_norm": 45.33203887939453, |
|
"learning_rate": 4.643708381453178e-05, |
|
"loss": 6.4937, |
|
"step": 6300 |
|
}, |
|
{ |
|
"combined_loss": 2.257235050201416, |
|
"distill_loss": 1.4594223499298096, |
|
"epoch": 0.23078613817862115, |
|
"step": 6300, |
|
"student_mlm_loss": 3.0550475120544434 |
|
}, |
|
{ |
|
"epoch": 0.23444941021320242, |
|
"grad_norm": 24.137001037597656, |
|
"learning_rate": 4.6375654225127166e-05, |
|
"loss": 2.8761, |
|
"step": 6400 |
|
}, |
|
{ |
|
"combined_loss": 3.673408031463623, |
|
"distill_loss": 1.5113860368728638, |
|
"epoch": 0.23444941021320242, |
|
"step": 6400, |
|
"student_mlm_loss": 5.835430145263672 |
|
}, |
|
{ |
|
"epoch": 0.23811268224778373, |
|
"grad_norm": 89.53437042236328, |
|
"learning_rate": 4.631422463572254e-05, |
|
"loss": 4.9469, |
|
"step": 6500 |
|
}, |
|
{ |
|
"combined_loss": 2.289175271987915, |
|
"distill_loss": 1.6255369186401367, |
|
"epoch": 0.23811268224778373, |
|
"step": 6500, |
|
"student_mlm_loss": 2.9528136253356934 |
|
}, |
|
{ |
|
"epoch": 0.241775954282365, |
|
"grad_norm": 29.47341537475586, |
|
"learning_rate": 4.6252795046317915e-05, |
|
"loss": 3.2857, |
|
"step": 6600 |
|
}, |
|
{ |
|
"combined_loss": 2.986036777496338, |
|
"distill_loss": 1.3628634214401245, |
|
"epoch": 0.241775954282365, |
|
"step": 6600, |
|
"student_mlm_loss": 4.609210014343262 |
|
}, |
|
{ |
|
"epoch": 0.24543922631694629, |
|
"grad_norm": 8.413643836975098, |
|
"learning_rate": 4.6191365456913286e-05, |
|
"loss": 4.1874, |
|
"step": 6700 |
|
}, |
|
{ |
|
"combined_loss": 4.9381103515625, |
|
"distill_loss": 1.5604116916656494, |
|
"epoch": 0.24543922631694629, |
|
"step": 6700, |
|
"student_mlm_loss": 8.31580924987793 |
|
}, |
|
{ |
|
"epoch": 0.2491024983515276, |
|
"grad_norm": 19.279678344726562, |
|
"learning_rate": 4.6129935867508664e-05, |
|
"loss": 5.5581, |
|
"step": 6800 |
|
}, |
|
{ |
|
"combined_loss": 4.7175493240356445, |
|
"distill_loss": 1.5657355785369873, |
|
"epoch": 0.2491024983515276, |
|
"step": 6800, |
|
"student_mlm_loss": 7.869362831115723 |
|
}, |
|
{ |
|
"epoch": 0.25276577038610887, |
|
"grad_norm": 14.9283447265625, |
|
"learning_rate": 4.606850627810404e-05, |
|
"loss": 4.6319, |
|
"step": 6900 |
|
}, |
|
{ |
|
"combined_loss": 5.707411766052246, |
|
"distill_loss": 1.566019058227539, |
|
"epoch": 0.25276577038610887, |
|
"step": 6900, |
|
"student_mlm_loss": 9.848804473876953 |
|
}, |
|
{ |
|
"epoch": 0.25642904242069015, |
|
"grad_norm": 5.006555557250977, |
|
"learning_rate": 4.600707668869941e-05, |
|
"loss": 6.1192, |
|
"step": 7000 |
|
}, |
|
{ |
|
"combined_loss": 4.373297691345215, |
|
"distill_loss": 1.4654217958450317, |
|
"epoch": 0.25642904242069015, |
|
"step": 7000, |
|
"student_mlm_loss": 7.281173229217529 |
|
}, |
|
{ |
|
"epoch": 0.2600923144552714, |
|
"grad_norm": 15.025683403015137, |
|
"learning_rate": 4.594564709929479e-05, |
|
"loss": 3.472, |
|
"step": 7100 |
|
}, |
|
{ |
|
"combined_loss": 5.1388630867004395, |
|
"distill_loss": 1.5254905223846436, |
|
"epoch": 0.2600923144552714, |
|
"step": 7100, |
|
"student_mlm_loss": 8.752235412597656 |
|
}, |
|
{ |
|
"epoch": 0.26375558648985276, |
|
"grad_norm": 44.157169342041016, |
|
"learning_rate": 4.588421750989017e-05, |
|
"loss": 8.8482, |
|
"step": 7200 |
|
}, |
|
{ |
|
"combined_loss": 2.1565892696380615, |
|
"distill_loss": 1.2985585927963257, |
|
"epoch": 0.26375558648985276, |
|
"step": 7200, |
|
"student_mlm_loss": 3.014619827270508 |
|
}, |
|
{ |
|
"epoch": 0.26741885852443403, |
|
"grad_norm": 5.755523204803467, |
|
"learning_rate": 4.5822787920485546e-05, |
|
"loss": 5.7829, |
|
"step": 7300 |
|
}, |
|
{ |
|
"combined_loss": 2.5404441356658936, |
|
"distill_loss": 1.5058717727661133, |
|
"epoch": 0.26741885852443403, |
|
"step": 7300, |
|
"student_mlm_loss": 3.575016498565674 |
|
}, |
|
{ |
|
"epoch": 0.2710821305590153, |
|
"grad_norm": 15.252013206481934, |
|
"learning_rate": 4.576135833108092e-05, |
|
"loss": 7.9361, |
|
"step": 7400 |
|
}, |
|
{ |
|
"combined_loss": 2.5752511024475098, |
|
"distill_loss": 1.5916697978973389, |
|
"epoch": 0.2710821305590153, |
|
"step": 7400, |
|
"student_mlm_loss": 3.5588326454162598 |
|
}, |
|
{ |
|
"epoch": 0.2747454025935966, |
|
"grad_norm": 26.218740463256836, |
|
"learning_rate": 4.5699928741676295e-05, |
|
"loss": 4.8534, |
|
"step": 7500 |
|
}, |
|
{ |
|
"combined_loss": 2.1656486988067627, |
|
"distill_loss": 1.4179739952087402, |
|
"epoch": 0.2747454025935966, |
|
"step": 7500, |
|
"student_mlm_loss": 2.913323402404785 |
|
}, |
|
{ |
|
"epoch": 0.27840867462817787, |
|
"grad_norm": 6.031148910522461, |
|
"learning_rate": 4.5638499152271666e-05, |
|
"loss": 6.4535, |
|
"step": 7600 |
|
}, |
|
{ |
|
"combined_loss": 2.8603813648223877, |
|
"distill_loss": 1.5837383270263672, |
|
"epoch": 0.27840867462817787, |
|
"step": 7600, |
|
"student_mlm_loss": 4.137024402618408 |
|
}, |
|
{ |
|
"epoch": 0.2820719466627592, |
|
"grad_norm": 107.95591735839844, |
|
"learning_rate": 4.5577069562867044e-05, |
|
"loss": 3.2702, |
|
"step": 7700 |
|
}, |
|
{ |
|
"combined_loss": 1.8474111557006836, |
|
"distill_loss": 1.437280297279358, |
|
"epoch": 0.2820719466627592, |
|
"step": 7700, |
|
"student_mlm_loss": 2.257542133331299 |
|
}, |
|
{ |
|
"epoch": 0.2857352186973405, |
|
"grad_norm": 5.394913673400879, |
|
"learning_rate": 4.551563997346242e-05, |
|
"loss": 2.8998, |
|
"step": 7800 |
|
}, |
|
{ |
|
"combined_loss": 4.77987813949585, |
|
"distill_loss": 1.5358555316925049, |
|
"epoch": 0.2857352186973405, |
|
"step": 7800, |
|
"student_mlm_loss": 8.023900985717773 |
|
}, |
|
{ |
|
"epoch": 0.28939849073192175, |
|
"grad_norm": 7.790286540985107, |
|
"learning_rate": 4.545421038405779e-05, |
|
"loss": 2.9018, |
|
"step": 7900 |
|
}, |
|
{ |
|
"combined_loss": 3.34071946144104, |
|
"distill_loss": 1.3893283605575562, |
|
"epoch": 0.28939849073192175, |
|
"step": 7900, |
|
"student_mlm_loss": 5.292110443115234 |
|
}, |
|
{ |
|
"epoch": 0.29306176276650303, |
|
"grad_norm": 10.3685941696167, |
|
"learning_rate": 4.539278079465317e-05, |
|
"loss": 3.5884, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.29306176276650303, |
|
"eval_loss": 3.7581117153167725, |
|
"eval_runtime": 2.0302, |
|
"eval_samples_per_second": 3446.049, |
|
"eval_steps_per_second": 13.792, |
|
"step": 8000 |
|
}, |
|
{ |
|
"combined_loss": 2.8955559730529785, |
|
"distill_loss": 1.3627426624298096, |
|
"epoch": 0.29306176276650303, |
|
"step": 8000, |
|
"student_mlm_loss": 4.428369522094727 |
|
}, |
|
{ |
|
"epoch": 0.2967250348010843, |
|
"grad_norm": 49.06619644165039, |
|
"learning_rate": 4.533135120524855e-05, |
|
"loss": 3.5788, |
|
"step": 8100 |
|
}, |
|
{ |
|
"combined_loss": 4.52724552154541, |
|
"distill_loss": 1.3924285173416138, |
|
"epoch": 0.2967250348010843, |
|
"step": 8100, |
|
"student_mlm_loss": 7.662062644958496 |
|
}, |
|
{ |
|
"epoch": 0.30038830683566564, |
|
"grad_norm": 27.40319061279297, |
|
"learning_rate": 4.5269921615843926e-05, |
|
"loss": 3.9229, |
|
"step": 8200 |
|
}, |
|
{ |
|
"combined_loss": 3.3075461387634277, |
|
"distill_loss": 1.5311795473098755, |
|
"epoch": 0.30038830683566564, |
|
"step": 8200, |
|
"student_mlm_loss": 5.0839128494262695 |
|
}, |
|
{ |
|
"epoch": 0.3040515788702469, |
|
"grad_norm": 31.07562255859375, |
|
"learning_rate": 4.52084920264393e-05, |
|
"loss": 3.9566, |
|
"step": 8300 |
|
}, |
|
{ |
|
"combined_loss": 1.9784274101257324, |
|
"distill_loss": 1.41036057472229, |
|
"epoch": 0.3040515788702469, |
|
"step": 8300, |
|
"student_mlm_loss": 2.546494245529175 |
|
}, |
|
{ |
|
"epoch": 0.3077148509048282, |
|
"grad_norm": 4.548298358917236, |
|
"learning_rate": 4.514706243703467e-05, |
|
"loss": 5.1591, |
|
"step": 8400 |
|
}, |
|
{ |
|
"combined_loss": 1.9796760082244873, |
|
"distill_loss": 1.408158302307129, |
|
"epoch": 0.3077148509048282, |
|
"step": 8400, |
|
"student_mlm_loss": 2.5511937141418457 |
|
}, |
|
{ |
|
"epoch": 0.3113781229394095, |
|
"grad_norm": 8.897561073303223, |
|
"learning_rate": 4.5085632847630046e-05, |
|
"loss": 5.7057, |
|
"step": 8500 |
|
}, |
|
{ |
|
"combined_loss": 2.080671548843384, |
|
"distill_loss": 1.4321857690811157, |
|
"epoch": 0.3113781229394095, |
|
"step": 8500, |
|
"student_mlm_loss": 2.7291574478149414 |
|
}, |
|
{ |
|
"epoch": 0.31504139497399075, |
|
"grad_norm": 10.005053520202637, |
|
"learning_rate": 4.5024203258225424e-05, |
|
"loss": 7.7928, |
|
"step": 8600 |
|
}, |
|
{ |
|
"combined_loss": 2.6395342350006104, |
|
"distill_loss": 1.5675503015518188, |
|
"epoch": 0.31504139497399075, |
|
"step": 8600, |
|
"student_mlm_loss": 3.7115182876586914 |
|
}, |
|
{ |
|
"epoch": 0.31870466700857203, |
|
"grad_norm": 5.425146579742432, |
|
"learning_rate": 4.49627736688208e-05, |
|
"loss": 3.7716, |
|
"step": 8700 |
|
}, |
|
{ |
|
"combined_loss": 2.9848690032958984, |
|
"distill_loss": 1.592170000076294, |
|
"epoch": 0.31870466700857203, |
|
"step": 8700, |
|
"student_mlm_loss": 4.377568244934082 |
|
}, |
|
{ |
|
"epoch": 0.32236793904315336, |
|
"grad_norm": 5.64302396774292, |
|
"learning_rate": 4.490134407941617e-05, |
|
"loss": 6.8888, |
|
"step": 8800 |
|
}, |
|
{ |
|
"combined_loss": 4.167844772338867, |
|
"distill_loss": 1.4308810234069824, |
|
"epoch": 0.32236793904315336, |
|
"step": 8800, |
|
"student_mlm_loss": 6.904808044433594 |
|
}, |
|
{ |
|
"epoch": 0.32603121107773464, |
|
"grad_norm": 99.88166809082031, |
|
"learning_rate": 4.483991449001155e-05, |
|
"loss": 3.988, |
|
"step": 8900 |
|
}, |
|
{ |
|
"combined_loss": 2.484290599822998, |
|
"distill_loss": 1.3509743213653564, |
|
"epoch": 0.32603121107773464, |
|
"step": 8900, |
|
"student_mlm_loss": 3.6176071166992188 |
|
}, |
|
{ |
|
"epoch": 0.3296944831123159, |
|
"grad_norm": 74.52608489990234, |
|
"learning_rate": 4.477848490060693e-05, |
|
"loss": 7.0959, |
|
"step": 9000 |
|
}, |
|
{ |
|
"combined_loss": 3.0457074642181396, |
|
"distill_loss": 1.3116565942764282, |
|
"epoch": 0.3296944831123159, |
|
"step": 9000, |
|
"student_mlm_loss": 4.779758453369141 |
|
}, |
|
{ |
|
"epoch": 0.3333577551468972, |
|
"grad_norm": 11.735849380493164, |
|
"learning_rate": 4.47170553112023e-05, |
|
"loss": 3.3274, |
|
"step": 9100 |
|
}, |
|
{ |
|
"combined_loss": 4.452191352844238, |
|
"distill_loss": 1.3943032026290894, |
|
"epoch": 0.3333577551468972, |
|
"step": 9100, |
|
"student_mlm_loss": 7.510079860687256 |
|
}, |
|
{ |
|
"epoch": 0.33702102718147847, |
|
"grad_norm": 9.601778030395508, |
|
"learning_rate": 4.465562572179768e-05, |
|
"loss": 3.8928, |
|
"step": 9200 |
|
}, |
|
{ |
|
"combined_loss": 4.875356197357178, |
|
"distill_loss": 1.4536867141723633, |
|
"epoch": 0.33702102718147847, |
|
"step": 9200, |
|
"student_mlm_loss": 8.297025680541992 |
|
}, |
|
{ |
|
"epoch": 0.3406842992160598, |
|
"grad_norm": 9.49219799041748, |
|
"learning_rate": 4.459419613239305e-05, |
|
"loss": 3.7362, |
|
"step": 9300 |
|
}, |
|
{ |
|
"combined_loss": 2.9027719497680664, |
|
"distill_loss": 1.3480241298675537, |
|
"epoch": 0.3406842992160598, |
|
"step": 9300, |
|
"student_mlm_loss": 4.45751953125 |
|
}, |
|
{ |
|
"epoch": 0.3443475712506411, |
|
"grad_norm": 7.6804728507995605, |
|
"learning_rate": 4.453276654298843e-05, |
|
"loss": 4.4018, |
|
"step": 9400 |
|
}, |
|
{ |
|
"combined_loss": 2.7022647857666016, |
|
"distill_loss": 1.3614214658737183, |
|
"epoch": 0.3443475712506411, |
|
"step": 9400, |
|
"student_mlm_loss": 4.043107986450195 |
|
}, |
|
{ |
|
"epoch": 0.34801084328522236, |
|
"grad_norm": 38.41388702392578, |
|
"learning_rate": 4.4471336953583804e-05, |
|
"loss": 3.0632, |
|
"step": 9500 |
|
}, |
|
{ |
|
"combined_loss": 1.9494025707244873, |
|
"distill_loss": 1.3876396417617798, |
|
"epoch": 0.34801084328522236, |
|
"step": 9500, |
|
"student_mlm_loss": 2.5111656188964844 |
|
}, |
|
{ |
|
"epoch": 0.35167411531980364, |
|
"grad_norm": 37.10932540893555, |
|
"learning_rate": 4.440990736417918e-05, |
|
"loss": 3.3258, |
|
"step": 9600 |
|
}, |
|
{ |
|
"combined_loss": 2.6435036659240723, |
|
"distill_loss": 1.3941702842712402, |
|
"epoch": 0.35167411531980364, |
|
"step": 9600, |
|
"student_mlm_loss": 3.8928370475769043 |
|
}, |
|
{ |
|
"epoch": 0.3553373873543849, |
|
"grad_norm": 17.652099609375, |
|
"learning_rate": 4.434847777477455e-05, |
|
"loss": 8.3854, |
|
"step": 9700 |
|
}, |
|
{ |
|
"combined_loss": 2.336359977722168, |
|
"distill_loss": 1.5497583150863647, |
|
"epoch": 0.3553373873543849, |
|
"step": 9700, |
|
"student_mlm_loss": 3.1229615211486816 |
|
}, |
|
{ |
|
"epoch": 0.35900065938896625, |
|
"grad_norm": 58.41902160644531, |
|
"learning_rate": 4.428704818536993e-05, |
|
"loss": 6.9624, |
|
"step": 9800 |
|
}, |
|
{ |
|
"combined_loss": 2.6561923027038574, |
|
"distill_loss": 1.5154696702957153, |
|
"epoch": 0.35900065938896625, |
|
"step": 9800, |
|
"student_mlm_loss": 3.796915054321289 |
|
}, |
|
{ |
|
"epoch": 0.3626639314235475, |
|
"grad_norm": 23.230680465698242, |
|
"learning_rate": 4.422561859596531e-05, |
|
"loss": 3.4226, |
|
"step": 9900 |
|
}, |
|
{ |
|
"combined_loss": 1.9643871784210205, |
|
"distill_loss": 1.3770619630813599, |
|
"epoch": 0.3626639314235475, |
|
"step": 9900, |
|
"student_mlm_loss": 2.5517125129699707 |
|
}, |
|
{ |
|
"epoch": 0.3663272034581288, |
|
"grad_norm": 11.580951690673828, |
|
"learning_rate": 4.416418900656068e-05, |
|
"loss": 4.7414, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3663272034581288, |
|
"eval_loss": 3.8432743549346924, |
|
"eval_runtime": 2.2879, |
|
"eval_samples_per_second": 3057.772, |
|
"eval_steps_per_second": 12.238, |
|
"step": 10000 |
|
}, |
|
{ |
|
"combined_loss": 2.395519971847534, |
|
"distill_loss": 1.382614254951477, |
|
"epoch": 0.3663272034581288, |
|
"step": 10000, |
|
"student_mlm_loss": 3.408425807952881 |
|
}, |
|
{ |
|
"epoch": 0.3699904754927101, |
|
"grad_norm": 19.014955520629883, |
|
"learning_rate": 4.410275941715606e-05, |
|
"loss": 6.6365, |
|
"step": 10100 |
|
}, |
|
{ |
|
"combined_loss": 2.1697921752929688, |
|
"distill_loss": 1.5128508806228638, |
|
"epoch": 0.3699904754927101, |
|
"step": 10100, |
|
"student_mlm_loss": 2.8267335891723633 |
|
}, |
|
{ |
|
"epoch": 0.37365374752729136, |
|
"grad_norm": 6.532296180725098, |
|
"learning_rate": 4.404132982775143e-05, |
|
"loss": 3.199, |
|
"step": 10200 |
|
}, |
|
{ |
|
"combined_loss": 1.8516874313354492, |
|
"distill_loss": 1.413927674293518, |
|
"epoch": 0.37365374752729136, |
|
"step": 10200, |
|
"student_mlm_loss": 2.289447069168091 |
|
}, |
|
{ |
|
"epoch": 0.3773170195618727, |
|
"grad_norm": 25.607181549072266, |
|
"learning_rate": 4.397990023834681e-05, |
|
"loss": 3.822, |
|
"step": 10300 |
|
}, |
|
{ |
|
"combined_loss": 3.3827946186065674, |
|
"distill_loss": 1.4635933637619019, |
|
"epoch": 0.3773170195618727, |
|
"step": 10300, |
|
"student_mlm_loss": 5.301995754241943 |
|
}, |
|
{ |
|
"epoch": 0.38098029159645397, |
|
"grad_norm": 12.52314567565918, |
|
"learning_rate": 4.3918470648942184e-05, |
|
"loss": 6.9491, |
|
"step": 10400 |
|
}, |
|
{ |
|
"combined_loss": 1.9748457670211792, |
|
"distill_loss": 1.445707082748413, |
|
"epoch": 0.38098029159645397, |
|
"step": 10400, |
|
"student_mlm_loss": 2.5039844512939453 |
|
}, |
|
{ |
|
"epoch": 0.38464356363103525, |
|
"grad_norm": 12.69713306427002, |
|
"learning_rate": 4.385704105953756e-05, |
|
"loss": 9.4794, |
|
"step": 10500 |
|
}, |
|
{ |
|
"combined_loss": 3.5582261085510254, |
|
"distill_loss": 1.4324952363967896, |
|
"epoch": 0.38464356363103525, |
|
"step": 10500, |
|
"student_mlm_loss": 5.683957099914551 |
|
}, |
|
{ |
|
"epoch": 0.3883068356656165, |
|
"grad_norm": 9.131495475769043, |
|
"learning_rate": 4.379561147013293e-05, |
|
"loss": 7.1932, |
|
"step": 10600 |
|
}, |
|
{ |
|
"combined_loss": 6.080216407775879, |
|
"distill_loss": 1.477283000946045, |
|
"epoch": 0.3883068356656165, |
|
"step": 10600, |
|
"student_mlm_loss": 10.683149337768555 |
|
}, |
|
{ |
|
"epoch": 0.3919701077001978, |
|
"grad_norm": 24.739810943603516, |
|
"learning_rate": 4.373418188072831e-05, |
|
"loss": 5.6399, |
|
"step": 10700 |
|
}, |
|
{ |
|
"combined_loss": 3.7993698120117188, |
|
"distill_loss": 1.452317476272583, |
|
"epoch": 0.3919701077001978, |
|
"step": 10700, |
|
"student_mlm_loss": 6.146422386169434 |
|
}, |
|
{ |
|
"epoch": 0.3956333797347791, |
|
"grad_norm": 42.44218063354492, |
|
"learning_rate": 4.367275229132369e-05, |
|
"loss": 4.2291, |
|
"step": 10800 |
|
}, |
|
{ |
|
"combined_loss": 2.037079095840454, |
|
"distill_loss": 1.4349570274353027, |
|
"epoch": 0.3956333797347791, |
|
"step": 10800, |
|
"student_mlm_loss": 2.6392011642456055 |
|
}, |
|
{ |
|
"epoch": 0.3992966517693604, |
|
"grad_norm": 231.26116943359375, |
|
"learning_rate": 4.361132270191906e-05, |
|
"loss": 4.6188, |
|
"step": 10900 |
|
}, |
|
{ |
|
"combined_loss": 182.1781768798828, |
|
"distill_loss": 1.4427307844161987, |
|
"epoch": 0.3992966517693604, |
|
"step": 10900, |
|
"student_mlm_loss": 362.91363525390625 |
|
}, |
|
{ |
|
"epoch": 0.4029599238039417, |
|
"grad_norm": 16.01262092590332, |
|
"learning_rate": 4.354989311251444e-05, |
|
"loss": 4.8535, |
|
"step": 11000 |
|
}, |
|
{ |
|
"combined_loss": 3.2922308444976807, |
|
"distill_loss": 1.7308834791183472, |
|
"epoch": 0.4029599238039417, |
|
"step": 11000, |
|
"student_mlm_loss": 4.853578090667725 |
|
}, |
|
{ |
|
"epoch": 0.40662319583852297, |
|
"grad_norm": 23.69573974609375, |
|
"learning_rate": 4.3488463523109816e-05, |
|
"loss": 2.8692, |
|
"step": 11100 |
|
}, |
|
{ |
|
"combined_loss": 2.1010890007019043, |
|
"distill_loss": 1.3140019178390503, |
|
"epoch": 0.40662319583852297, |
|
"step": 11100, |
|
"student_mlm_loss": 2.888176202774048 |
|
}, |
|
{ |
|
"epoch": 0.41028646787310424, |
|
"grad_norm": 9.695125579833984, |
|
"learning_rate": 4.3427033933705193e-05, |
|
"loss": 7.6829, |
|
"step": 11200 |
|
}, |
|
{ |
|
"combined_loss": 2.24194598197937, |
|
"distill_loss": 1.560063362121582, |
|
"epoch": 0.41028646787310424, |
|
"step": 11200, |
|
"student_mlm_loss": 2.923828601837158 |
|
}, |
|
{ |
|
"epoch": 0.4139497399076855, |
|
"grad_norm": 37.06310272216797, |
|
"learning_rate": 4.3365604344300565e-05, |
|
"loss": 3.5562, |
|
"step": 11300 |
|
}, |
|
{ |
|
"combined_loss": 9.297407150268555, |
|
"distill_loss": 1.2328678369522095, |
|
"epoch": 0.4139497399076855, |
|
"step": 11300, |
|
"student_mlm_loss": 17.36194610595703 |
|
}, |
|
{ |
|
"epoch": 0.41761301194226685, |
|
"grad_norm": 6.411166667938232, |
|
"learning_rate": 4.330417475489594e-05, |
|
"loss": 4.0543, |
|
"step": 11400 |
|
}, |
|
{ |
|
"combined_loss": 2.141500949859619, |
|
"distill_loss": 1.467064380645752, |
|
"epoch": 0.41761301194226685, |
|
"step": 11400, |
|
"student_mlm_loss": 2.8159377574920654 |
|
}, |
|
{ |
|
"epoch": 0.42127628397684813, |
|
"grad_norm": 5.802677154541016, |
|
"learning_rate": 4.3242745165491313e-05, |
|
"loss": 14.3215, |
|
"step": 11500 |
|
}, |
|
{ |
|
"combined_loss": 6.576130390167236, |
|
"distill_loss": 1.46802818775177, |
|
"epoch": 0.42127628397684813, |
|
"step": 11500, |
|
"student_mlm_loss": 11.684232711791992 |
|
}, |
|
{ |
|
"epoch": 0.4249395560114294, |
|
"grad_norm": 15.660844802856445, |
|
"learning_rate": 4.318131557608669e-05, |
|
"loss": 30.5877, |
|
"step": 11600 |
|
}, |
|
{ |
|
"combined_loss": 1.9305293560028076, |
|
"distill_loss": 1.405720591545105, |
|
"epoch": 0.4249395560114294, |
|
"step": 11600, |
|
"student_mlm_loss": 2.4553380012512207 |
|
}, |
|
{ |
|
"epoch": 0.4286028280460107, |
|
"grad_norm": 3.041947603225708, |
|
"learning_rate": 4.311988598668207e-05, |
|
"loss": 3.7156, |
|
"step": 11700 |
|
}, |
|
{ |
|
"combined_loss": 2.78572940826416, |
|
"distill_loss": 1.45219886302948, |
|
"epoch": 0.4286028280460107, |
|
"step": 11700, |
|
"student_mlm_loss": 4.119259834289551 |
|
}, |
|
{ |
|
"epoch": 0.43226610008059196, |
|
"grad_norm": 20.6744384765625, |
|
"learning_rate": 4.305845639727744e-05, |
|
"loss": 3.3939, |
|
"step": 11800 |
|
}, |
|
{ |
|
"combined_loss": 2.0835349559783936, |
|
"distill_loss": 1.4508671760559082, |
|
"epoch": 0.43226610008059196, |
|
"step": 11800, |
|
"student_mlm_loss": 2.716202735900879 |
|
}, |
|
{ |
|
"epoch": 0.4359293721151733, |
|
"grad_norm": 5.804731369018555, |
|
"learning_rate": 4.299702680787282e-05, |
|
"loss": 6.1951, |
|
"step": 11900 |
|
}, |
|
{ |
|
"combined_loss": 3.1048030853271484, |
|
"distill_loss": 1.455564260482788, |
|
"epoch": 0.4359293721151733, |
|
"step": 11900, |
|
"student_mlm_loss": 4.75404167175293 |
|
}, |
|
{ |
|
"epoch": 0.4395926441497546, |
|
"grad_norm": 33.689720153808594, |
|
"learning_rate": 4.2935597218468196e-05, |
|
"loss": 3.6583, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.4395926441497546, |
|
"eval_loss": 3.919630527496338, |
|
"eval_runtime": 2.0425, |
|
"eval_samples_per_second": 3425.261, |
|
"eval_steps_per_second": 13.709, |
|
"step": 12000 |
|
}, |
|
{ |
|
"combined_loss": 2.315965175628662, |
|
"distill_loss": 1.3009124994277954, |
|
"epoch": 0.4395926441497546, |
|
"step": 12000, |
|
"student_mlm_loss": 3.3310179710388184 |
|
}, |
|
{ |
|
"epoch": 0.44325591618433585, |
|
"grad_norm": 24.73545265197754, |
|
"learning_rate": 4.2874167629063574e-05, |
|
"loss": 2.9828, |
|
"step": 12100 |
|
}, |
|
{ |
|
"combined_loss": 5.060952186584473, |
|
"distill_loss": 1.3712559938430786, |
|
"epoch": 0.44325591618433585, |
|
"step": 12100, |
|
"student_mlm_loss": 8.750648498535156 |
|
}, |
|
{ |
|
"epoch": 0.44691918821891713, |
|
"grad_norm": 19.548921585083008, |
|
"learning_rate": 4.2812738039658945e-05, |
|
"loss": 3.1716, |
|
"step": 12200 |
|
}, |
|
{ |
|
"combined_loss": 2.3697307109832764, |
|
"distill_loss": 1.480096459388733, |
|
"epoch": 0.44691918821891713, |
|
"step": 12200, |
|
"student_mlm_loss": 3.2593650817871094 |
|
}, |
|
{ |
|
"epoch": 0.4505824602534984, |
|
"grad_norm": 6.217925548553467, |
|
"learning_rate": 4.2751308450254316e-05, |
|
"loss": 5.1037, |
|
"step": 12300 |
|
}, |
|
{ |
|
"combined_loss": 1.9682085514068604, |
|
"distill_loss": 1.3534774780273438, |
|
"epoch": 0.4505824602534984, |
|
"step": 12300, |
|
"student_mlm_loss": 2.582939624786377 |
|
}, |
|
{ |
|
"epoch": 0.45424573228807974, |
|
"grad_norm": 53.592735290527344, |
|
"learning_rate": 4.2689878860849694e-05, |
|
"loss": 5.3409, |
|
"step": 12400 |
|
}, |
|
{ |
|
"combined_loss": 2.413550853729248, |
|
"distill_loss": 1.3951433897018433, |
|
"epoch": 0.45424573228807974, |
|
"step": 12400, |
|
"student_mlm_loss": 3.4319584369659424 |
|
}, |
|
{ |
|
"epoch": 0.457909004322661, |
|
"grad_norm": 13.716507911682129, |
|
"learning_rate": 4.262844927144507e-05, |
|
"loss": 3.2261, |
|
"step": 12500 |
|
}, |
|
{ |
|
"combined_loss": 3.6318020820617676, |
|
"distill_loss": 1.3529082536697388, |
|
"epoch": 0.457909004322661, |
|
"step": 12500, |
|
"student_mlm_loss": 5.910696029663086 |
|
}, |
|
{ |
|
"epoch": 0.4615722763572423, |
|
"grad_norm": 16.206933975219727, |
|
"learning_rate": 4.256701968204045e-05, |
|
"loss": 3.1534, |
|
"step": 12600 |
|
}, |
|
{ |
|
"combined_loss": 15.371432304382324, |
|
"distill_loss": 1.4290032386779785, |
|
"epoch": 0.4615722763572423, |
|
"step": 12600, |
|
"student_mlm_loss": 29.313861846923828 |
|
}, |
|
{ |
|
"epoch": 0.4652355483918236, |
|
"grad_norm": 8.626960754394531, |
|
"learning_rate": 4.250559009263582e-05, |
|
"loss": 3.0824, |
|
"step": 12700 |
|
}, |
|
{ |
|
"combined_loss": 2.0715112686157227, |
|
"distill_loss": 1.3553932905197144, |
|
"epoch": 0.4652355483918236, |
|
"step": 12700, |
|
"student_mlm_loss": 2.7876293659210205 |
|
}, |
|
{ |
|
"epoch": 0.46889882042640485, |
|
"grad_norm": 8.153878211975098, |
|
"learning_rate": 4.24441605032312e-05, |
|
"loss": 3.8805, |
|
"step": 12800 |
|
}, |
|
{ |
|
"combined_loss": 2.0972392559051514, |
|
"distill_loss": 1.2276250123977661, |
|
"epoch": 0.46889882042640485, |
|
"step": 12800, |
|
"student_mlm_loss": 2.966853618621826 |
|
}, |
|
{ |
|
"epoch": 0.4725620924609861, |
|
"grad_norm": 12.068700790405273, |
|
"learning_rate": 4.2382730913826576e-05, |
|
"loss": 2.8937, |
|
"step": 12900 |
|
}, |
|
{ |
|
"combined_loss": 2.9497852325439453, |
|
"distill_loss": 1.314728021621704, |
|
"epoch": 0.4725620924609861, |
|
"step": 12900, |
|
"student_mlm_loss": 4.584842681884766 |
|
}, |
|
{ |
|
"epoch": 0.47622536449556746, |
|
"grad_norm": 12.260379791259766, |
|
"learning_rate": 4.232130132442195e-05, |
|
"loss": 5.581, |
|
"step": 13000 |
|
}, |
|
{ |
|
"combined_loss": 1.8658246994018555, |
|
"distill_loss": 1.2703187465667725, |
|
"epoch": 0.47622536449556746, |
|
"step": 13000, |
|
"student_mlm_loss": 2.4613306522369385 |
|
}, |
|
{ |
|
"epoch": 0.47988863653014874, |
|
"grad_norm": 22.688852310180664, |
|
"learning_rate": 4.2259871735017325e-05, |
|
"loss": 7.0059, |
|
"step": 13100 |
|
}, |
|
{ |
|
"combined_loss": 3.673346519470215, |
|
"distill_loss": 1.397099256515503, |
|
"epoch": 0.47988863653014874, |
|
"step": 13100, |
|
"student_mlm_loss": 5.949593544006348 |
|
}, |
|
{ |
|
"epoch": 0.48355190856473, |
|
"grad_norm": 28.811817169189453, |
|
"learning_rate": 4.2198442145612696e-05, |
|
"loss": 9.6395, |
|
"step": 13200 |
|
}, |
|
{ |
|
"combined_loss": 2.036362409591675, |
|
"distill_loss": 1.3239866495132446, |
|
"epoch": 0.48355190856473, |
|
"step": 13200, |
|
"student_mlm_loss": 2.7487380504608154 |
|
}, |
|
{ |
|
"epoch": 0.4872151805993113, |
|
"grad_norm": 6.380947589874268, |
|
"learning_rate": 4.213701255620808e-05, |
|
"loss": 2.7095, |
|
"step": 13300 |
|
}, |
|
{ |
|
"combined_loss": 2.2547478675842285, |
|
"distill_loss": 1.4122509956359863, |
|
"epoch": 0.4872151805993113, |
|
"step": 13300, |
|
"student_mlm_loss": 3.09724497795105 |
|
}, |
|
{ |
|
"epoch": 0.49087845263389257, |
|
"grad_norm": 83.60982513427734, |
|
"learning_rate": 4.207558296680345e-05, |
|
"loss": 3.2917, |
|
"step": 13400 |
|
}, |
|
{ |
|
"combined_loss": 2.009040355682373, |
|
"distill_loss": 1.4236946105957031, |
|
"epoch": 0.49087845263389257, |
|
"step": 13400, |
|
"student_mlm_loss": 2.594385862350464 |
|
}, |
|
{ |
|
"epoch": 0.4945417246684739, |
|
"grad_norm": 10.06588077545166, |
|
"learning_rate": 4.201415337739883e-05, |
|
"loss": 12.3205, |
|
"step": 13500 |
|
}, |
|
{ |
|
"combined_loss": 2.9317073822021484, |
|
"distill_loss": 1.4229042530059814, |
|
"epoch": 0.4945417246684739, |
|
"step": 13500, |
|
"student_mlm_loss": 4.440510272979736 |
|
}, |
|
{ |
|
"epoch": 0.4982049967030552, |
|
"grad_norm": 4.126479625701904, |
|
"learning_rate": 4.19527237879942e-05, |
|
"loss": 3.8077, |
|
"step": 13600 |
|
}, |
|
{ |
|
"combined_loss": 1.9033926725387573, |
|
"distill_loss": 1.357490062713623, |
|
"epoch": 0.4982049967030552, |
|
"step": 13600, |
|
"student_mlm_loss": 2.4492952823638916 |
|
}, |
|
{ |
|
"epoch": 0.5018682687376365, |
|
"grad_norm": 18.483203887939453, |
|
"learning_rate": 4.189129419858958e-05, |
|
"loss": 11.6361, |
|
"step": 13700 |
|
}, |
|
{ |
|
"combined_loss": 3.165005683898926, |
|
"distill_loss": 1.3812006711959839, |
|
"epoch": 0.5018682687376365, |
|
"step": 13700, |
|
"student_mlm_loss": 4.948810577392578 |
|
}, |
|
{ |
|
"epoch": 0.5055315407722177, |
|
"grad_norm": 7.388655662536621, |
|
"learning_rate": 4.1829864609184956e-05, |
|
"loss": 3.875, |
|
"step": 13800 |
|
}, |
|
{ |
|
"combined_loss": 1.8155145645141602, |
|
"distill_loss": 1.3641600608825684, |
|
"epoch": 0.5055315407722177, |
|
"step": 13800, |
|
"student_mlm_loss": 2.266869068145752 |
|
}, |
|
{ |
|
"epoch": 0.509194812806799, |
|
"grad_norm": 9.352982521057129, |
|
"learning_rate": 4.176843501978033e-05, |
|
"loss": 9.268, |
|
"step": 13900 |
|
}, |
|
{ |
|
"combined_loss": 2.3618173599243164, |
|
"distill_loss": 1.3162891864776611, |
|
"epoch": 0.509194812806799, |
|
"step": 13900, |
|
"student_mlm_loss": 3.4073452949523926 |
|
}, |
|
{ |
|
"epoch": 0.5128580848413803, |
|
"grad_norm": 8.513871192932129, |
|
"learning_rate": 4.1707005430375705e-05, |
|
"loss": 3.3999, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.5128580848413803, |
|
"eval_loss": 3.5987370014190674, |
|
"eval_runtime": 2.2869, |
|
"eval_samples_per_second": 3059.222, |
|
"eval_steps_per_second": 12.244, |
|
"step": 14000 |
|
}, |
|
{ |
|
"combined_loss": 2.6841559410095215, |
|
"distill_loss": 1.401199460029602, |
|
"epoch": 0.5128580848413803, |
|
"step": 14000, |
|
"student_mlm_loss": 3.9671125411987305 |
|
}, |
|
{ |
|
"epoch": 0.5165213568759616, |
|
"grad_norm": 30.661813735961914, |
|
"learning_rate": 4.1645575840971076e-05, |
|
"loss": 18.3341, |
|
"step": 14100 |
|
}, |
|
{ |
|
"combined_loss": 4.752758026123047, |
|
"distill_loss": 1.247560977935791, |
|
"epoch": 0.5165213568759616, |
|
"step": 14100, |
|
"student_mlm_loss": 8.257954597473145 |
|
}, |
|
{ |
|
"epoch": 0.5201846289105428, |
|
"grad_norm": 40.303707122802734, |
|
"learning_rate": 4.158414625156646e-05, |
|
"loss": 3.1057, |
|
"step": 14200 |
|
}, |
|
{ |
|
"combined_loss": 1.988144874572754, |
|
"distill_loss": 1.2577546834945679, |
|
"epoch": 0.5201846289105428, |
|
"step": 14200, |
|
"student_mlm_loss": 2.7185349464416504 |
|
}, |
|
{ |
|
"epoch": 0.5238479009451242, |
|
"grad_norm": 19.77947235107422, |
|
"learning_rate": 4.152271666216183e-05, |
|
"loss": 7.3457, |
|
"step": 14300 |
|
}, |
|
{ |
|
"combined_loss": 4.299380779266357, |
|
"distill_loss": 1.2770593166351318, |
|
"epoch": 0.5238479009451242, |
|
"step": 14300, |
|
"student_mlm_loss": 7.321702480316162 |
|
}, |
|
{ |
|
"epoch": 0.5275111729797055, |
|
"grad_norm": 7.412100315093994, |
|
"learning_rate": 4.146128707275721e-05, |
|
"loss": 4.8104, |
|
"step": 14400 |
|
}, |
|
{ |
|
"combined_loss": 10.650766372680664, |
|
"distill_loss": 1.3233892917633057, |
|
"epoch": 0.5275111729797055, |
|
"step": 14400, |
|
"student_mlm_loss": 19.9781436920166 |
|
}, |
|
{ |
|
"epoch": 0.5311744450142868, |
|
"grad_norm": 5.799710750579834, |
|
"learning_rate": 4.139985748335258e-05, |
|
"loss": 3.4765, |
|
"step": 14500 |
|
}, |
|
{ |
|
"combined_loss": 2.4540774822235107, |
|
"distill_loss": 1.319036841392517, |
|
"epoch": 0.5311744450142868, |
|
"step": 14500, |
|
"student_mlm_loss": 3.589118003845215 |
|
}, |
|
{ |
|
"epoch": 0.5348377170488681, |
|
"grad_norm": 7.147758483886719, |
|
"learning_rate": 4.133842789394796e-05, |
|
"loss": 3.12, |
|
"step": 14600 |
|
}, |
|
{ |
|
"combined_loss": 1.8580541610717773, |
|
"distill_loss": 1.3114832639694214, |
|
"epoch": 0.5348377170488681, |
|
"step": 14600, |
|
"student_mlm_loss": 2.4046249389648438 |
|
}, |
|
{ |
|
"epoch": 0.5385009890834493, |
|
"grad_norm": 5.120487213134766, |
|
"learning_rate": 4.1276998304543336e-05, |
|
"loss": 6.7029, |
|
"step": 14700 |
|
}, |
|
{ |
|
"combined_loss": 1.9685258865356445, |
|
"distill_loss": 1.2455390691757202, |
|
"epoch": 0.5385009890834493, |
|
"step": 14700, |
|
"student_mlm_loss": 2.6915125846862793 |
|
}, |
|
{ |
|
"epoch": 0.5421642611180306, |
|
"grad_norm": 6.225675106048584, |
|
"learning_rate": 4.121556871513871e-05, |
|
"loss": 7.1336, |
|
"step": 14800 |
|
}, |
|
{ |
|
"combined_loss": 1.8886613845825195, |
|
"distill_loss": 1.2913726568222046, |
|
"epoch": 0.5421642611180306, |
|
"step": 14800, |
|
"student_mlm_loss": 2.485949993133545 |
|
}, |
|
{ |
|
"epoch": 0.5458275331526119, |
|
"grad_norm": 11.508244514465332, |
|
"learning_rate": 4.1154139125734085e-05, |
|
"loss": 11.8719, |
|
"step": 14900 |
|
}, |
|
{ |
|
"combined_loss": 2.1455585956573486, |
|
"distill_loss": 1.3711117506027222, |
|
"epoch": 0.5458275331526119, |
|
"step": 14900, |
|
"student_mlm_loss": 2.9200053215026855 |
|
}, |
|
{ |
|
"epoch": 0.5494908051871932, |
|
"grad_norm": 17.030780792236328, |
|
"learning_rate": 4.109270953632946e-05, |
|
"loss": 3.091, |
|
"step": 15000 |
|
}, |
|
{ |
|
"combined_loss": 1.9433504343032837, |
|
"distill_loss": 1.538583517074585, |
|
"epoch": 0.5494908051871932, |
|
"step": 15000, |
|
"student_mlm_loss": 2.3481173515319824 |
|
}, |
|
{ |
|
"epoch": 0.5531540772217745, |
|
"grad_norm": 4.692992687225342, |
|
"learning_rate": 4.103127994692484e-05, |
|
"loss": 3.2488, |
|
"step": 15100 |
|
}, |
|
{ |
|
"combined_loss": 2.820077657699585, |
|
"distill_loss": 1.2906769514083862, |
|
"epoch": 0.5531540772217745, |
|
"step": 15100, |
|
"student_mlm_loss": 4.349478244781494 |
|
}, |
|
{ |
|
"epoch": 0.5568173492563557, |
|
"grad_norm": 49.70892333984375, |
|
"learning_rate": 4.096985035752021e-05, |
|
"loss": 10.6593, |
|
"step": 15200 |
|
}, |
|
{ |
|
"combined_loss": 1.857104778289795, |
|
"distill_loss": 1.4106833934783936, |
|
"epoch": 0.5568173492563557, |
|
"step": 15200, |
|
"student_mlm_loss": 2.3035261631011963 |
|
}, |
|
{ |
|
"epoch": 0.5604806212909371, |
|
"grad_norm": 7.913967609405518, |
|
"learning_rate": 4.090842076811558e-05, |
|
"loss": 3.3056, |
|
"step": 15300 |
|
}, |
|
{ |
|
"combined_loss": 3.2144076824188232, |
|
"distill_loss": 1.3917032480239868, |
|
"epoch": 0.5604806212909371, |
|
"step": 15300, |
|
"student_mlm_loss": 5.037112236022949 |
|
}, |
|
{ |
|
"epoch": 0.5641438933255184, |
|
"grad_norm": 10.575057983398438, |
|
"learning_rate": 4.084699117871096e-05, |
|
"loss": 10.0757, |
|
"step": 15400 |
|
}, |
|
{ |
|
"combined_loss": 5.352452754974365, |
|
"distill_loss": 1.3542910814285278, |
|
"epoch": 0.5641438933255184, |
|
"step": 15400, |
|
"student_mlm_loss": 9.350614547729492 |
|
}, |
|
{ |
|
"epoch": 0.5678071653600997, |
|
"grad_norm": 119.92784118652344, |
|
"learning_rate": 4.078556158930634e-05, |
|
"loss": 3.4463, |
|
"step": 15500 |
|
}, |
|
{ |
|
"combined_loss": 1.7753610610961914, |
|
"distill_loss": 1.3875095844268799, |
|
"epoch": 0.5678071653600997, |
|
"step": 15500, |
|
"student_mlm_loss": 2.163212537765503 |
|
}, |
|
{ |
|
"epoch": 0.571470437394681, |
|
"grad_norm": 4.203140735626221, |
|
"learning_rate": 4.0724131999901717e-05, |
|
"loss": 4.8205, |
|
"step": 15600 |
|
}, |
|
{ |
|
"combined_loss": 1.8941802978515625, |
|
"distill_loss": 1.3584777116775513, |
|
"epoch": 0.571470437394681, |
|
"step": 15600, |
|
"student_mlm_loss": 2.4298830032348633 |
|
}, |
|
{ |
|
"epoch": 0.5751337094292622, |
|
"grad_norm": 16.848825454711914, |
|
"learning_rate": 4.066270241049709e-05, |
|
"loss": 7.7339, |
|
"step": 15700 |
|
}, |
|
{ |
|
"combined_loss": 1.9499808549880981, |
|
"distill_loss": 1.3122260570526123, |
|
"epoch": 0.5751337094292622, |
|
"step": 15700, |
|
"student_mlm_loss": 2.587735652923584 |
|
}, |
|
{ |
|
"epoch": 0.5787969814638435, |
|
"grad_norm": 2.9838955402374268, |
|
"learning_rate": 4.0601272821092465e-05, |
|
"loss": 3.4354, |
|
"step": 15800 |
|
}, |
|
{ |
|
"combined_loss": 1.9672229290008545, |
|
"distill_loss": 1.3119910955429077, |
|
"epoch": 0.5787969814638435, |
|
"step": 15800, |
|
"student_mlm_loss": 2.622454881668091 |
|
}, |
|
{ |
|
"epoch": 0.5824602534984248, |
|
"grad_norm": 6.6938676834106445, |
|
"learning_rate": 4.053984323168784e-05, |
|
"loss": 5.2244, |
|
"step": 15900 |
|
}, |
|
{ |
|
"combined_loss": 2.8469321727752686, |
|
"distill_loss": 1.361178994178772, |
|
"epoch": 0.5824602534984248, |
|
"step": 15900, |
|
"student_mlm_loss": 4.332685470581055 |
|
}, |
|
{ |
|
"epoch": 0.5861235255330061, |
|
"grad_norm": 31.440717697143555, |
|
"learning_rate": 4.047841364228322e-05, |
|
"loss": 8.7168, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5861235255330061, |
|
"eval_loss": 3.480536937713623, |
|
"eval_runtime": 2.1572, |
|
"eval_samples_per_second": 3243.154, |
|
"eval_steps_per_second": 12.98, |
|
"step": 16000 |
|
}, |
|
{ |
|
"combined_loss": 2.0847339630126953, |
|
"distill_loss": 1.4640412330627441, |
|
"epoch": 0.5861235255330061, |
|
"step": 16000, |
|
"student_mlm_loss": 2.7054266929626465 |
|
}, |
|
{ |
|
"epoch": 0.5897867975675873, |
|
"grad_norm": 6.238570690155029, |
|
"learning_rate": 4.041698405287859e-05, |
|
"loss": 3.2375, |
|
"step": 16100 |
|
}, |
|
{ |
|
"combined_loss": 2.2635374069213867, |
|
"distill_loss": 1.5188945531845093, |
|
"epoch": 0.5897867975675873, |
|
"step": 16100, |
|
"student_mlm_loss": 3.0081801414489746 |
|
}, |
|
{ |
|
"epoch": 0.5934500696021686, |
|
"grad_norm": 11.832098960876465, |
|
"learning_rate": 4.035555446347396e-05, |
|
"loss": 3.3115, |
|
"step": 16200 |
|
}, |
|
{ |
|
"combined_loss": 2.2285714149475098, |
|
"distill_loss": 1.4724992513656616, |
|
"epoch": 0.5934500696021686, |
|
"step": 16200, |
|
"student_mlm_loss": 2.9846436977386475 |
|
}, |
|
{ |
|
"epoch": 0.5971133416367499, |
|
"grad_norm": 8.876389503479004, |
|
"learning_rate": 4.029412487406934e-05, |
|
"loss": 4.1388, |
|
"step": 16300 |
|
}, |
|
{ |
|
"combined_loss": 2.0907256603240967, |
|
"distill_loss": 1.2955131530761719, |
|
"epoch": 0.5971133416367499, |
|
"step": 16300, |
|
"student_mlm_loss": 2.8859381675720215 |
|
}, |
|
{ |
|
"epoch": 0.6007766136713313, |
|
"grad_norm": 4.118688106536865, |
|
"learning_rate": 4.023269528466472e-05, |
|
"loss": 5.4036, |
|
"step": 16400 |
|
}, |
|
{ |
|
"combined_loss": 5.190587997436523, |
|
"distill_loss": 1.502519965171814, |
|
"epoch": 0.6007766136713313, |
|
"step": 16400, |
|
"student_mlm_loss": 8.878656387329102 |
|
}, |
|
{ |
|
"epoch": 0.6044398857059126, |
|
"grad_norm": 17.806203842163086, |
|
"learning_rate": 4.01712656952601e-05, |
|
"loss": 3.4529, |
|
"step": 16500 |
|
}, |
|
{ |
|
"combined_loss": 2.0771563053131104, |
|
"distill_loss": 1.5032036304473877, |
|
"epoch": 0.6044398857059126, |
|
"step": 16500, |
|
"student_mlm_loss": 2.651108980178833 |
|
}, |
|
{ |
|
"epoch": 0.6081031577404938, |
|
"grad_norm": 11.406692504882812, |
|
"learning_rate": 4.010983610585547e-05, |
|
"loss": 2.9157, |
|
"step": 16600 |
|
}, |
|
{ |
|
"combined_loss": 2.0262105464935303, |
|
"distill_loss": 1.406888723373413, |
|
"epoch": 0.6081031577404938, |
|
"step": 16600, |
|
"student_mlm_loss": 2.6455323696136475 |
|
}, |
|
{ |
|
"epoch": 0.6117664297750751, |
|
"grad_norm": 9.248611450195312, |
|
"learning_rate": 4.0048406516450846e-05, |
|
"loss": 3.7273, |
|
"step": 16700 |
|
}, |
|
{ |
|
"combined_loss": 9.912755966186523, |
|
"distill_loss": 1.3654385805130005, |
|
"epoch": 0.6117664297750751, |
|
"step": 16700, |
|
"student_mlm_loss": 18.460073471069336 |
|
}, |
|
{ |
|
"epoch": 0.6154297018096564, |
|
"grad_norm": 7.337488651275635, |
|
"learning_rate": 3.9986976927046223e-05, |
|
"loss": 3.5316, |
|
"step": 16800 |
|
}, |
|
{ |
|
"combined_loss": 2.2111759185791016, |
|
"distill_loss": 1.410059928894043, |
|
"epoch": 0.6154297018096564, |
|
"step": 16800, |
|
"student_mlm_loss": 3.012291669845581 |
|
}, |
|
{ |
|
"epoch": 0.6190929738442377, |
|
"grad_norm": 3.7927513122558594, |
|
"learning_rate": 3.9925547337641595e-05, |
|
"loss": 2.942, |
|
"step": 16900 |
|
}, |
|
{ |
|
"combined_loss": 1.9941096305847168, |
|
"distill_loss": 1.3353883028030396, |
|
"epoch": 0.6190929738442377, |
|
"step": 16900, |
|
"student_mlm_loss": 2.6528310775756836 |
|
}, |
|
{ |
|
"epoch": 0.622756245878819, |
|
"grad_norm": 8.092863082885742, |
|
"learning_rate": 3.986411774823697e-05, |
|
"loss": 8.3194, |
|
"step": 17000 |
|
}, |
|
{ |
|
"combined_loss": 1.8197941780090332, |
|
"distill_loss": 1.2830308675765991, |
|
"epoch": 0.622756245878819, |
|
"step": 17000, |
|
"student_mlm_loss": 2.356557607650757 |
|
}, |
|
{ |
|
"epoch": 0.6264195179134002, |
|
"grad_norm": 21.95607566833496, |
|
"learning_rate": 3.9802688158832343e-05, |
|
"loss": 3.6842, |
|
"step": 17100 |
|
}, |
|
{ |
|
"combined_loss": 1.967858076095581, |
|
"distill_loss": 1.3744505643844604, |
|
"epoch": 0.6264195179134002, |
|
"step": 17100, |
|
"student_mlm_loss": 2.561265707015991 |
|
}, |
|
{ |
|
"epoch": 0.6300827899479815, |
|
"grad_norm": 17.734630584716797, |
|
"learning_rate": 3.974125856942773e-05, |
|
"loss": 3.4446, |
|
"step": 17200 |
|
}, |
|
{ |
|
"combined_loss": 3.56831955909729, |
|
"distill_loss": 1.4127169847488403, |
|
"epoch": 0.6300827899479815, |
|
"step": 17200, |
|
"student_mlm_loss": 5.723922252655029 |
|
}, |
|
{ |
|
"epoch": 0.6337460619825628, |
|
"grad_norm": 14.227143287658691, |
|
"learning_rate": 3.96798289800231e-05, |
|
"loss": 4.3058, |
|
"step": 17300 |
|
}, |
|
{ |
|
"combined_loss": 6.485238552093506, |
|
"distill_loss": 1.3285768032073975, |
|
"epoch": 0.6337460619825628, |
|
"step": 17300, |
|
"student_mlm_loss": 11.641900062561035 |
|
}, |
|
{ |
|
"epoch": 0.6374093340171441, |
|
"grad_norm": 27.379819869995117, |
|
"learning_rate": 3.961839939061848e-05, |
|
"loss": 3.3666, |
|
"step": 17400 |
|
}, |
|
{ |
|
"combined_loss": 3.212083339691162, |
|
"distill_loss": 1.3358004093170166, |
|
"epoch": 0.6374093340171441, |
|
"step": 17400, |
|
"student_mlm_loss": 5.088366508483887 |
|
}, |
|
{ |
|
"epoch": 0.6410726060517254, |
|
"grad_norm": 6.261890411376953, |
|
"learning_rate": 3.955696980121385e-05, |
|
"loss": 6.3216, |
|
"step": 17500 |
|
}, |
|
{ |
|
"combined_loss": 1.8787257671356201, |
|
"distill_loss": 1.3068917989730835, |
|
"epoch": 0.6410726060517254, |
|
"step": 17500, |
|
"student_mlm_loss": 2.4505598545074463 |
|
}, |
|
{ |
|
"epoch": 0.6447358780863067, |
|
"grad_norm": 4.643723011016846, |
|
"learning_rate": 3.9495540211809226e-05, |
|
"loss": 6.3659, |
|
"step": 17600 |
|
}, |
|
{ |
|
"combined_loss": 1.9111711978912354, |
|
"distill_loss": 1.315952181816101, |
|
"epoch": 0.6447358780863067, |
|
"step": 17600, |
|
"student_mlm_loss": 2.506390333175659 |
|
}, |
|
{ |
|
"epoch": 0.648399150120888, |
|
"grad_norm": 209.94358825683594, |
|
"learning_rate": 3.9434110622404604e-05, |
|
"loss": 3.1778, |
|
"step": 17700 |
|
}, |
|
{ |
|
"combined_loss": 2.7990779876708984, |
|
"distill_loss": 1.360758662223816, |
|
"epoch": 0.648399150120888, |
|
"step": 17700, |
|
"student_mlm_loss": 4.237397193908691 |
|
}, |
|
{ |
|
"epoch": 0.6520624221554693, |
|
"grad_norm": 25.861230850219727, |
|
"learning_rate": 3.9372681032999975e-05, |
|
"loss": 6.5636, |
|
"step": 17800 |
|
}, |
|
{ |
|
"combined_loss": 3.8194119930267334, |
|
"distill_loss": 1.45068359375, |
|
"epoch": 0.6520624221554693, |
|
"step": 17800, |
|
"student_mlm_loss": 6.188140392303467 |
|
}, |
|
{ |
|
"epoch": 0.6557256941900506, |
|
"grad_norm": 46.81015396118164, |
|
"learning_rate": 3.931125144359535e-05, |
|
"loss": 6.4281, |
|
"step": 17900 |
|
}, |
|
{ |
|
"combined_loss": 1.8790740966796875, |
|
"distill_loss": 1.2603598833084106, |
|
"epoch": 0.6557256941900506, |
|
"step": 17900, |
|
"student_mlm_loss": 2.497788429260254 |
|
}, |
|
{ |
|
"epoch": 0.6593889662246318, |
|
"grad_norm": 3.634798049926758, |
|
"learning_rate": 3.924982185419073e-05, |
|
"loss": 3.7705, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6593889662246318, |
|
"eval_loss": 3.4686477184295654, |
|
"eval_runtime": 2.0476, |
|
"eval_samples_per_second": 3416.619, |
|
"eval_steps_per_second": 13.674, |
|
"step": 18000 |
|
}, |
|
{ |
|
"combined_loss": 1.8001245260238647, |
|
"distill_loss": 1.358407735824585, |
|
"epoch": 0.6593889662246318, |
|
"step": 18000, |
|
"student_mlm_loss": 2.2418413162231445 |
|
}, |
|
{ |
|
"epoch": 0.6630522382592131, |
|
"grad_norm": 14.09543514251709, |
|
"learning_rate": 3.918839226478611e-05, |
|
"loss": 7.2198, |
|
"step": 18100 |
|
}, |
|
{ |
|
"combined_loss": 2.165346622467041, |
|
"distill_loss": 1.3290469646453857, |
|
"epoch": 0.6630522382592131, |
|
"step": 18100, |
|
"student_mlm_loss": 3.0016462802886963 |
|
}, |
|
{ |
|
"epoch": 0.6667155102937944, |
|
"grad_norm": 4.29142951965332, |
|
"learning_rate": 3.912696267538148e-05, |
|
"loss": 4.3053, |
|
"step": 18200 |
|
}, |
|
{ |
|
"combined_loss": 1.8569279909133911, |
|
"distill_loss": 1.355130910873413, |
|
"epoch": 0.6667155102937944, |
|
"step": 18200, |
|
"student_mlm_loss": 2.358725070953369 |
|
}, |
|
{ |
|
"epoch": 0.6703787823283757, |
|
"grad_norm": 4.424899101257324, |
|
"learning_rate": 3.906553308597686e-05, |
|
"loss": 3.2385, |
|
"step": 18300 |
|
}, |
|
{ |
|
"combined_loss": 2.083707094192505, |
|
"distill_loss": 1.307104229927063, |
|
"epoch": 0.6703787823283757, |
|
"step": 18300, |
|
"student_mlm_loss": 2.8603098392486572 |
|
}, |
|
{ |
|
"epoch": 0.6740420543629569, |
|
"grad_norm": 8.061409950256348, |
|
"learning_rate": 3.900410349657223e-05, |
|
"loss": 2.9075, |
|
"step": 18400 |
|
}, |
|
{ |
|
"combined_loss": 1.9213597774505615, |
|
"distill_loss": 1.434320330619812, |
|
"epoch": 0.6740420543629569, |
|
"step": 18400, |
|
"student_mlm_loss": 2.4083993434906006 |
|
}, |
|
{ |
|
"epoch": 0.6777053263975383, |
|
"grad_norm": 55.50898361206055, |
|
"learning_rate": 3.8942673907167606e-05, |
|
"loss": 13.4077, |
|
"step": 18500 |
|
}, |
|
{ |
|
"combined_loss": 2.01340389251709, |
|
"distill_loss": 1.3991159200668335, |
|
"epoch": 0.6777053263975383, |
|
"step": 18500, |
|
"student_mlm_loss": 2.6276917457580566 |
|
}, |
|
{ |
|
"epoch": 0.6813685984321196, |
|
"grad_norm": 5.348477840423584, |
|
"learning_rate": 3.8881244317762984e-05, |
|
"loss": 6.8559, |
|
"step": 18600 |
|
}, |
|
{ |
|
"combined_loss": 2.5955307483673096, |
|
"distill_loss": 1.4375801086425781, |
|
"epoch": 0.6813685984321196, |
|
"step": 18600, |
|
"student_mlm_loss": 3.753481388092041 |
|
}, |
|
{ |
|
"epoch": 0.6850318704667009, |
|
"grad_norm": 26.911954879760742, |
|
"learning_rate": 3.8819814728358355e-05, |
|
"loss": 9.8471, |
|
"step": 18700 |
|
}, |
|
{ |
|
"combined_loss": 2.3086562156677246, |
|
"distill_loss": 1.4082762002944946, |
|
"epoch": 0.6850318704667009, |
|
"step": 18700, |
|
"student_mlm_loss": 3.209036350250244 |
|
}, |
|
{ |
|
"epoch": 0.6886951425012822, |
|
"grad_norm": 8.086039543151855, |
|
"learning_rate": 3.875838513895373e-05, |
|
"loss": 3.841, |
|
"step": 18800 |
|
}, |
|
{ |
|
"combined_loss": 4.487699031829834, |
|
"distill_loss": 1.4052667617797852, |
|
"epoch": 0.6886951425012822, |
|
"step": 18800, |
|
"student_mlm_loss": 7.570131301879883 |
|
}, |
|
{ |
|
"epoch": 0.6923584145358634, |
|
"grad_norm": 10.749812126159668, |
|
"learning_rate": 3.869695554954911e-05, |
|
"loss": 9.7279, |
|
"step": 18900 |
|
}, |
|
{ |
|
"combined_loss": 3.3014779090881348, |
|
"distill_loss": 1.246164083480835, |
|
"epoch": 0.6923584145358634, |
|
"step": 18900, |
|
"student_mlm_loss": 5.3567914962768555 |
|
}, |
|
{ |
|
"epoch": 0.6960216865704447, |
|
"grad_norm": 11.313789367675781, |
|
"learning_rate": 3.863552596014449e-05, |
|
"loss": 28.0849, |
|
"step": 19000 |
|
}, |
|
{ |
|
"combined_loss": 4.825923919677734, |
|
"distill_loss": 1.377113938331604, |
|
"epoch": 0.6960216865704447, |
|
"step": 19000, |
|
"student_mlm_loss": 8.274733543395996 |
|
}, |
|
{ |
|
"epoch": 0.699684958605026, |
|
"grad_norm": 3.8648459911346436, |
|
"learning_rate": 3.857409637073986e-05, |
|
"loss": 5.8981, |
|
"step": 19100 |
|
}, |
|
{ |
|
"combined_loss": 3.4921586513519287, |
|
"distill_loss": 1.4171725511550903, |
|
"epoch": 0.699684958605026, |
|
"step": 19100, |
|
"student_mlm_loss": 5.567144870758057 |
|
}, |
|
{ |
|
"epoch": 0.7033482306396073, |
|
"grad_norm": 18.98455238342285, |
|
"learning_rate": 3.851266678133523e-05, |
|
"loss": 2.5944, |
|
"step": 19200 |
|
}, |
|
{ |
|
"combined_loss": 1.8949182033538818, |
|
"distill_loss": 1.3743678331375122, |
|
"epoch": 0.7033482306396073, |
|
"step": 19200, |
|
"student_mlm_loss": 2.415468692779541 |
|
}, |
|
{ |
|
"epoch": 0.7070115026741886, |
|
"grad_norm": 27.53456687927246, |
|
"learning_rate": 3.845123719193061e-05, |
|
"loss": 2.8462, |
|
"step": 19300 |
|
}, |
|
{ |
|
"combined_loss": 1.8077284097671509, |
|
"distill_loss": 1.2764451503753662, |
|
"epoch": 0.7070115026741886, |
|
"step": 19300, |
|
"student_mlm_loss": 2.3390116691589355 |
|
}, |
|
{ |
|
"epoch": 0.7106747747087698, |
|
"grad_norm": 8.815896987915039, |
|
"learning_rate": 3.8389807602525986e-05, |
|
"loss": 3.403, |
|
"step": 19400 |
|
}, |
|
{ |
|
"combined_loss": 2.2496674060821533, |
|
"distill_loss": 1.408218264579773, |
|
"epoch": 0.7106747747087698, |
|
"step": 19400, |
|
"student_mlm_loss": 3.091116428375244 |
|
}, |
|
{ |
|
"epoch": 0.7143380467433511, |
|
"grad_norm": 20.02590560913086, |
|
"learning_rate": 3.8328378013121364e-05, |
|
"loss": 3.7767, |
|
"step": 19500 |
|
}, |
|
{ |
|
"combined_loss": 2.6540353298187256, |
|
"distill_loss": 1.451707124710083, |
|
"epoch": 0.7143380467433511, |
|
"step": 19500, |
|
"student_mlm_loss": 3.856363534927368 |
|
}, |
|
{ |
|
"epoch": 0.7180013187779325, |
|
"grad_norm": 48.139583587646484, |
|
"learning_rate": 3.8266948423716735e-05, |
|
"loss": 3.4148, |
|
"step": 19600 |
|
}, |
|
{ |
|
"combined_loss": 3.5710411071777344, |
|
"distill_loss": 1.2874888181686401, |
|
"epoch": 0.7180013187779325, |
|
"step": 19600, |
|
"student_mlm_loss": 5.854593276977539 |
|
}, |
|
{ |
|
"epoch": 0.7216645908125138, |
|
"grad_norm": 5.810763835906982, |
|
"learning_rate": 3.820551883431211e-05, |
|
"loss": 11.1815, |
|
"step": 19700 |
|
}, |
|
{ |
|
"combined_loss": 2.022658586502075, |
|
"distill_loss": 1.408826231956482, |
|
"epoch": 0.7216645908125138, |
|
"step": 19700, |
|
"student_mlm_loss": 2.636491060256958 |
|
}, |
|
{ |
|
"epoch": 0.725327862847095, |
|
"grad_norm": 5.03505277633667, |
|
"learning_rate": 3.814408924490749e-05, |
|
"loss": 3.5792, |
|
"step": 19800 |
|
}, |
|
{ |
|
"combined_loss": 2.450950860977173, |
|
"distill_loss": 1.3786026239395142, |
|
"epoch": 0.725327862847095, |
|
"step": 19800, |
|
"student_mlm_loss": 3.523299217224121 |
|
}, |
|
{ |
|
"epoch": 0.7289911348816763, |
|
"grad_norm": 44.703548431396484, |
|
"learning_rate": 3.808265965550287e-05, |
|
"loss": 14.0822, |
|
"step": 19900 |
|
}, |
|
{ |
|
"combined_loss": 1.8448269367218018, |
|
"distill_loss": 1.3061137199401855, |
|
"epoch": 0.7289911348816763, |
|
"step": 19900, |
|
"student_mlm_loss": 2.383540153503418 |
|
}, |
|
{ |
|
"epoch": 0.7326544069162576, |
|
"grad_norm": 73.46593475341797, |
|
"learning_rate": 3.802123006609824e-05, |
|
"loss": 3.5648, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.7326544069162576, |
|
"eval_loss": 3.689605474472046, |
|
"eval_runtime": 2.2951, |
|
"eval_samples_per_second": 3048.261, |
|
"eval_steps_per_second": 12.2, |
|
"step": 20000 |
|
}, |
|
{ |
|
"combined_loss": 5.831945896148682, |
|
"distill_loss": 1.2505719661712646, |
|
"epoch": 0.7326544069162576, |
|
"step": 20000, |
|
"student_mlm_loss": 10.41331958770752 |
|
}, |
|
{ |
|
"epoch": 0.7363176789508389, |
|
"grad_norm": 7.289074897766113, |
|
"learning_rate": 3.795980047669361e-05, |
|
"loss": 5.9452, |
|
"step": 20100 |
|
}, |
|
{ |
|
"combined_loss": 14.608942985534668, |
|
"distill_loss": 1.4141182899475098, |
|
"epoch": 0.7363176789508389, |
|
"step": 20100, |
|
"student_mlm_loss": 27.803768157958984 |
|
}, |
|
{ |
|
"epoch": 0.7399809509854202, |
|
"grad_norm": 15.717759132385254, |
|
"learning_rate": 3.7898370887288995e-05, |
|
"loss": 5.3196, |
|
"step": 20200 |
|
}, |
|
{ |
|
"combined_loss": 2.34932279586792, |
|
"distill_loss": 1.2641239166259766, |
|
"epoch": 0.7399809509854202, |
|
"step": 20200, |
|
"student_mlm_loss": 3.434521436691284 |
|
}, |
|
{ |
|
"epoch": 0.7436442230200014, |
|
"grad_norm": 75.113037109375, |
|
"learning_rate": 3.7836941297884366e-05, |
|
"loss": 3.4868, |
|
"step": 20300 |
|
}, |
|
{ |
|
"combined_loss": 2.0885400772094727, |
|
"distill_loss": 1.3560060262680054, |
|
"epoch": 0.7436442230200014, |
|
"step": 20300, |
|
"student_mlm_loss": 2.8210740089416504 |
|
}, |
|
{ |
|
"epoch": 0.7473074950545827, |
|
"grad_norm": 12.071985244750977, |
|
"learning_rate": 3.7775511708479744e-05, |
|
"loss": 3.1594, |
|
"step": 20400 |
|
}, |
|
{ |
|
"combined_loss": 2.104968309402466, |
|
"distill_loss": 1.456742286682129, |
|
"epoch": 0.7473074950545827, |
|
"step": 20400, |
|
"student_mlm_loss": 2.7531943321228027 |
|
}, |
|
{ |
|
"epoch": 0.750970767089164, |
|
"grad_norm": 49.17687225341797, |
|
"learning_rate": 3.7714082119075115e-05, |
|
"loss": 5.0772, |
|
"step": 20500 |
|
}, |
|
{ |
|
"combined_loss": 1.9532296657562256, |
|
"distill_loss": 1.2734321355819702, |
|
"epoch": 0.750970767089164, |
|
"step": 20500, |
|
"student_mlm_loss": 2.6330270767211914 |
|
}, |
|
{ |
|
"epoch": 0.7546340391237454, |
|
"grad_norm": 4.601011753082275, |
|
"learning_rate": 3.765265252967049e-05, |
|
"loss": 8.0874, |
|
"step": 20600 |
|
}, |
|
{ |
|
"combined_loss": 1.8828588724136353, |
|
"distill_loss": 1.35260009765625, |
|
"epoch": 0.7546340391237454, |
|
"step": 20600, |
|
"student_mlm_loss": 2.4131176471710205 |
|
}, |
|
{ |
|
"epoch": 0.7582973111583267, |
|
"grad_norm": 3.9183883666992188, |
|
"learning_rate": 3.759122294026587e-05, |
|
"loss": 3.1836, |
|
"step": 20700 |
|
}, |
|
{ |
|
"combined_loss": 3.261841058731079, |
|
"distill_loss": 1.35749351978302, |
|
"epoch": 0.7582973111583267, |
|
"step": 20700, |
|
"student_mlm_loss": 5.166188716888428 |
|
}, |
|
{ |
|
"epoch": 0.7619605831929079, |
|
"grad_norm": 59.35635757446289, |
|
"learning_rate": 3.752979335086124e-05, |
|
"loss": 3.446, |
|
"step": 20800 |
|
}, |
|
{ |
|
"combined_loss": 2.0783181190490723, |
|
"distill_loss": 1.3386023044586182, |
|
"epoch": 0.7619605831929079, |
|
"step": 20800, |
|
"student_mlm_loss": 2.8180341720581055 |
|
}, |
|
{ |
|
"epoch": 0.7656238552274892, |
|
"grad_norm": 14.875, |
|
"learning_rate": 3.746836376145662e-05, |
|
"loss": 8.5798, |
|
"step": 20900 |
|
}, |
|
{ |
|
"combined_loss": 1.926416039466858, |
|
"distill_loss": 1.3077542781829834, |
|
"epoch": 0.7656238552274892, |
|
"step": 20900, |
|
"student_mlm_loss": 2.5450778007507324 |
|
}, |
|
{ |
|
"epoch": 0.7692871272620705, |
|
"grad_norm": 23.419870376586914, |
|
"learning_rate": 3.740693417205199e-05, |
|
"loss": 5.2177, |
|
"step": 21000 |
|
}, |
|
{ |
|
"combined_loss": 1.7290170192718506, |
|
"distill_loss": 1.2258715629577637, |
|
"epoch": 0.7692871272620705, |
|
"step": 21000, |
|
"student_mlm_loss": 2.2321624755859375 |
|
}, |
|
{ |
|
"epoch": 0.7729503992966518, |
|
"grad_norm": 29.292964935302734, |
|
"learning_rate": 3.7345504582647375e-05, |
|
"loss": 13.8021, |
|
"step": 21100 |
|
}, |
|
{ |
|
"combined_loss": 1.9402461051940918, |
|
"distill_loss": 1.2749103307724, |
|
"epoch": 0.7729503992966518, |
|
"step": 21100, |
|
"student_mlm_loss": 2.6055819988250732 |
|
}, |
|
{ |
|
"epoch": 0.776613671331233, |
|
"grad_norm": 9.03995418548584, |
|
"learning_rate": 3.7284074993242747e-05, |
|
"loss": 6.547, |
|
"step": 21200 |
|
}, |
|
{ |
|
"combined_loss": 2.2710204124450684, |
|
"distill_loss": 1.312924861907959, |
|
"epoch": 0.776613671331233, |
|
"step": 21200, |
|
"student_mlm_loss": 3.229116201400757 |
|
}, |
|
{ |
|
"epoch": 0.7802769433658143, |
|
"grad_norm": 11.86938190460205, |
|
"learning_rate": 3.7222645403838124e-05, |
|
"loss": 12.9682, |
|
"step": 21300 |
|
}, |
|
{ |
|
"combined_loss": 3.114459991455078, |
|
"distill_loss": 1.318755865097046, |
|
"epoch": 0.7802769433658143, |
|
"step": 21300, |
|
"student_mlm_loss": 4.910163879394531 |
|
}, |
|
{ |
|
"epoch": 0.7839402154003956, |
|
"grad_norm": 14.11950969696045, |
|
"learning_rate": 3.7161215814433495e-05, |
|
"loss": 3.1257, |
|
"step": 21400 |
|
}, |
|
{ |
|
"combined_loss": 3.882293224334717, |
|
"distill_loss": 1.1930829286575317, |
|
"epoch": 0.7839402154003956, |
|
"step": 21400, |
|
"student_mlm_loss": 6.571503639221191 |
|
}, |
|
{ |
|
"epoch": 0.7876034874349769, |
|
"grad_norm": 22.7275447845459, |
|
"learning_rate": 3.709978622502887e-05, |
|
"loss": 3.1395, |
|
"step": 21500 |
|
}, |
|
{ |
|
"combined_loss": 2.00057315826416, |
|
"distill_loss": 1.3134089708328247, |
|
"epoch": 0.7876034874349769, |
|
"step": 21500, |
|
"student_mlm_loss": 2.687737226486206 |
|
}, |
|
{ |
|
"epoch": 0.7912667594695582, |
|
"grad_norm": 56.84143829345703, |
|
"learning_rate": 3.703835663562425e-05, |
|
"loss": 13.1799, |
|
"step": 21600 |
|
}, |
|
{ |
|
"combined_loss": 2.094574213027954, |
|
"distill_loss": 1.3792191743850708, |
|
"epoch": 0.7912667594695582, |
|
"step": 21600, |
|
"student_mlm_loss": 2.809929370880127 |
|
}, |
|
{ |
|
"epoch": 0.7949300315041395, |
|
"grad_norm": 30.655105590820312, |
|
"learning_rate": 3.697692704621962e-05, |
|
"loss": 4.1563, |
|
"step": 21700 |
|
}, |
|
{ |
|
"combined_loss": 2.167109489440918, |
|
"distill_loss": 1.3041900396347046, |
|
"epoch": 0.7949300315041395, |
|
"step": 21700, |
|
"student_mlm_loss": 3.030029058456421 |
|
}, |
|
{ |
|
"epoch": 0.7985933035387208, |
|
"grad_norm": 7.400668144226074, |
|
"learning_rate": 3.6915497456815e-05, |
|
"loss": 9.7848, |
|
"step": 21800 |
|
}, |
|
{ |
|
"combined_loss": 2.2639806270599365, |
|
"distill_loss": 1.3241550922393799, |
|
"epoch": 0.7985933035387208, |
|
"step": 21800, |
|
"student_mlm_loss": 3.203806161880493 |
|
}, |
|
{ |
|
"epoch": 0.8022565755733021, |
|
"grad_norm": 28.212512969970703, |
|
"learning_rate": 3.685406786741038e-05, |
|
"loss": 2.7595, |
|
"step": 21900 |
|
}, |
|
{ |
|
"combined_loss": 1.9249264001846313, |
|
"distill_loss": 1.337939739227295, |
|
"epoch": 0.8022565755733021, |
|
"step": 21900, |
|
"student_mlm_loss": 2.5119130611419678 |
|
}, |
|
{ |
|
"epoch": 0.8059198476078834, |
|
"grad_norm": 5.998919486999512, |
|
"learning_rate": 3.6792638278005756e-05, |
|
"loss": 5.9041, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.8059198476078834, |
|
"eval_loss": 3.310230016708374, |
|
"eval_runtime": 1.9252, |
|
"eval_samples_per_second": 3633.98, |
|
"eval_steps_per_second": 14.544, |
|
"step": 22000 |
|
}, |
|
{ |
|
"combined_loss": 2.208944320678711, |
|
"distill_loss": 1.2883169651031494, |
|
"epoch": 0.8059198476078834, |
|
"step": 22000, |
|
"student_mlm_loss": 3.1295716762542725 |
|
}, |
|
{ |
|
"epoch": 0.8095831196424647, |
|
"grad_norm": 42.16996383666992, |
|
"learning_rate": 3.673120868860113e-05, |
|
"loss": 10.4166, |
|
"step": 22100 |
|
}, |
|
{ |
|
"combined_loss": 2.089421510696411, |
|
"distill_loss": 1.3541114330291748, |
|
"epoch": 0.8095831196424647, |
|
"step": 22100, |
|
"student_mlm_loss": 2.8247315883636475 |
|
}, |
|
{ |
|
"epoch": 0.8132463916770459, |
|
"grad_norm": 10.702394485473633, |
|
"learning_rate": 3.6669779099196505e-05, |
|
"loss": 3.5812, |
|
"step": 22200 |
|
}, |
|
{ |
|
"combined_loss": 1.8974239826202393, |
|
"distill_loss": 1.3954590559005737, |
|
"epoch": 0.8132463916770459, |
|
"step": 22200, |
|
"student_mlm_loss": 2.3993890285491943 |
|
}, |
|
{ |
|
"epoch": 0.8169096637116272, |
|
"grad_norm": 149.82179260253906, |
|
"learning_rate": 3.6608349509791876e-05, |
|
"loss": 3.229, |
|
"step": 22300 |
|
}, |
|
{ |
|
"combined_loss": 2.0663747787475586, |
|
"distill_loss": 1.3880882263183594, |
|
"epoch": 0.8169096637116272, |
|
"step": 22300, |
|
"student_mlm_loss": 2.7446610927581787 |
|
}, |
|
{ |
|
"epoch": 0.8205729357462085, |
|
"grad_norm": 5.735169410705566, |
|
"learning_rate": 3.6546919920387253e-05, |
|
"loss": 13.0135, |
|
"step": 22400 |
|
}, |
|
{ |
|
"combined_loss": 2.3801686763763428, |
|
"distill_loss": 1.2296876907348633, |
|
"epoch": 0.8205729357462085, |
|
"step": 22400, |
|
"student_mlm_loss": 3.5306496620178223 |
|
}, |
|
{ |
|
"epoch": 0.8242362077807898, |
|
"grad_norm": 3.9154951572418213, |
|
"learning_rate": 3.648549033098263e-05, |
|
"loss": 3.0256, |
|
"step": 22500 |
|
}, |
|
{ |
|
"combined_loss": 2.619138240814209, |
|
"distill_loss": 1.369718313217163, |
|
"epoch": 0.8242362077807898, |
|
"step": 22500, |
|
"student_mlm_loss": 3.868557929992676 |
|
}, |
|
{ |
|
"epoch": 0.827899479815371, |
|
"grad_norm": 6.706686019897461, |
|
"learning_rate": 3.6424060741578e-05, |
|
"loss": 6.8373, |
|
"step": 22600 |
|
}, |
|
{ |
|
"combined_loss": 3.571559429168701, |
|
"distill_loss": 1.360285758972168, |
|
"epoch": 0.827899479815371, |
|
"step": 22600, |
|
"student_mlm_loss": 5.782833099365234 |
|
}, |
|
{ |
|
"epoch": 0.8315627518499524, |
|
"grad_norm": 63.70609664916992, |
|
"learning_rate": 3.636263115217338e-05, |
|
"loss": 3.1874, |
|
"step": 22700 |
|
}, |
|
{ |
|
"combined_loss": 6.645792007446289, |
|
"distill_loss": 1.3381716012954712, |
|
"epoch": 0.8315627518499524, |
|
"step": 22700, |
|
"student_mlm_loss": 11.953412055969238 |
|
}, |
|
{ |
|
"epoch": 0.8352260238845337, |
|
"grad_norm": 112.02607727050781, |
|
"learning_rate": 3.630120156276876e-05, |
|
"loss": 4.1698, |
|
"step": 22800 |
|
}, |
|
{ |
|
"combined_loss": 2.399282455444336, |
|
"distill_loss": 1.2190183401107788, |
|
"epoch": 0.8352260238845337, |
|
"step": 22800, |
|
"student_mlm_loss": 3.5795464515686035 |
|
}, |
|
{ |
|
"epoch": 0.838889295919115, |
|
"grad_norm": 319.05230712890625, |
|
"learning_rate": 3.6239771973364136e-05, |
|
"loss": 3.351, |
|
"step": 22900 |
|
}, |
|
{ |
|
"combined_loss": 5.626018047332764, |
|
"distill_loss": 1.3532286882400513, |
|
"epoch": 0.838889295919115, |
|
"step": 22900, |
|
"student_mlm_loss": 9.898807525634766 |
|
}, |
|
{ |
|
"epoch": 0.8425525679536963, |
|
"grad_norm": 4.46912956237793, |
|
"learning_rate": 3.617834238395951e-05, |
|
"loss": 3.1926, |
|
"step": 23000 |
|
}, |
|
{ |
|
"combined_loss": 1.8462562561035156, |
|
"distill_loss": 1.339337944984436, |
|
"epoch": 0.8425525679536963, |
|
"step": 23000, |
|
"student_mlm_loss": 2.3531746864318848 |
|
}, |
|
{ |
|
"epoch": 0.8462158399882775, |
|
"grad_norm": 15.756026268005371, |
|
"learning_rate": 3.611691279455488e-05, |
|
"loss": 11.7086, |
|
"step": 23100 |
|
}, |
|
{ |
|
"combined_loss": 3.4101529121398926, |
|
"distill_loss": 1.3407546281814575, |
|
"epoch": 0.8462158399882775, |
|
"step": 23100, |
|
"student_mlm_loss": 5.479551315307617 |
|
}, |
|
{ |
|
"epoch": 0.8498791120228588, |
|
"grad_norm": 12.350069046020508, |
|
"learning_rate": 3.6055483205150256e-05, |
|
"loss": 3.1203, |
|
"step": 23200 |
|
}, |
|
{ |
|
"combined_loss": 2.5675039291381836, |
|
"distill_loss": 1.2296205759048462, |
|
"epoch": 0.8498791120228588, |
|
"step": 23200, |
|
"student_mlm_loss": 3.9053874015808105 |
|
}, |
|
{ |
|
"epoch": 0.8535423840574401, |
|
"grad_norm": 11.17212963104248, |
|
"learning_rate": 3.5994053615745634e-05, |
|
"loss": 6.2935, |
|
"step": 23300 |
|
}, |
|
{ |
|
"combined_loss": 2.901674270629883, |
|
"distill_loss": 1.318871021270752, |
|
"epoch": 0.8535423840574401, |
|
"step": 23300, |
|
"student_mlm_loss": 4.484477519989014 |
|
}, |
|
{ |
|
"epoch": 0.8572056560920214, |
|
"grad_norm": 11.69430160522461, |
|
"learning_rate": 3.593262402634101e-05, |
|
"loss": 6.1123, |
|
"step": 23400 |
|
}, |
|
{ |
|
"combined_loss": 1.962475061416626, |
|
"distill_loss": 1.3837331533432007, |
|
"epoch": 0.8572056560920214, |
|
"step": 23400, |
|
"student_mlm_loss": 2.541217088699341 |
|
}, |
|
{ |
|
"epoch": 0.8608689281266027, |
|
"grad_norm": 6.221428394317627, |
|
"learning_rate": 3.587119443693638e-05, |
|
"loss": 5.0621, |
|
"step": 23500 |
|
}, |
|
{ |
|
"combined_loss": 2.3063066005706787, |
|
"distill_loss": 1.364685297012329, |
|
"epoch": 0.8608689281266027, |
|
"step": 23500, |
|
"student_mlm_loss": 3.2479279041290283 |
|
}, |
|
{ |
|
"epoch": 0.8645322001611839, |
|
"grad_norm": 3.200302839279175, |
|
"learning_rate": 3.580976484753176e-05, |
|
"loss": 3.1679, |
|
"step": 23600 |
|
}, |
|
{ |
|
"combined_loss": 14.653901100158691, |
|
"distill_loss": 1.3521461486816406, |
|
"epoch": 0.8645322001611839, |
|
"step": 23600, |
|
"student_mlm_loss": 27.955656051635742 |
|
}, |
|
{ |
|
"epoch": 0.8681954721957652, |
|
"grad_norm": 18.003841400146484, |
|
"learning_rate": 3.574833525812714e-05, |
|
"loss": 4.2524, |
|
"step": 23700 |
|
}, |
|
{ |
|
"combined_loss": 2.05013108253479, |
|
"distill_loss": 1.473749041557312, |
|
"epoch": 0.8681954721957652, |
|
"step": 23700, |
|
"student_mlm_loss": 2.6265130043029785 |
|
}, |
|
{ |
|
"epoch": 0.8718587442303466, |
|
"grad_norm": 16.64165687561035, |
|
"learning_rate": 3.5686905668722516e-05, |
|
"loss": 3.4139, |
|
"step": 23800 |
|
}, |
|
{ |
|
"combined_loss": 3.8039913177490234, |
|
"distill_loss": 1.3022387027740479, |
|
"epoch": 0.8718587442303466, |
|
"step": 23800, |
|
"student_mlm_loss": 6.305744171142578 |
|
}, |
|
{ |
|
"epoch": 0.8755220162649279, |
|
"grad_norm": 6.90595817565918, |
|
"learning_rate": 3.562547607931789e-05, |
|
"loss": 5.4512, |
|
"step": 23900 |
|
}, |
|
{ |
|
"combined_loss": 2.0175633430480957, |
|
"distill_loss": 1.2362921237945557, |
|
"epoch": 0.8755220162649279, |
|
"step": 23900, |
|
"student_mlm_loss": 2.7988343238830566 |
|
}, |
|
{ |
|
"epoch": 0.8791852882995091, |
|
"grad_norm": 26.792980194091797, |
|
"learning_rate": 3.556404648991326e-05, |
|
"loss": 6.622, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8791852882995091, |
|
"eval_loss": 3.643918991088867, |
|
"eval_runtime": 1.9198, |
|
"eval_samples_per_second": 3644.043, |
|
"eval_steps_per_second": 14.585, |
|
"step": 24000 |
|
}, |
|
{ |
|
"combined_loss": 2.1716020107269287, |
|
"distill_loss": 1.3234556913375854, |
|
"epoch": 0.8791852882995091, |
|
"step": 24000, |
|
"student_mlm_loss": 3.0197484493255615 |
|
}, |
|
{ |
|
"epoch": 0.8828485603340904, |
|
"grad_norm": 4.8087568283081055, |
|
"learning_rate": 3.550261690050864e-05, |
|
"loss": 4.0542, |
|
"step": 24100 |
|
}, |
|
{ |
|
"combined_loss": 13.035262107849121, |
|
"distill_loss": 1.353433609008789, |
|
"epoch": 0.8828485603340904, |
|
"step": 24100, |
|
"student_mlm_loss": 24.717090606689453 |
|
}, |
|
{ |
|
"epoch": 0.8865118323686717, |
|
"grad_norm": 10.60560417175293, |
|
"learning_rate": 3.5441187311104014e-05, |
|
"loss": 3.1068, |
|
"step": 24200 |
|
}, |
|
{ |
|
"combined_loss": 1.8867456912994385, |
|
"distill_loss": 1.2289210557937622, |
|
"epoch": 0.8865118323686717, |
|
"step": 24200, |
|
"student_mlm_loss": 2.544570207595825 |
|
}, |
|
{ |
|
"epoch": 0.890175104403253, |
|
"grad_norm": 11.34473705291748, |
|
"learning_rate": 3.537975772169939e-05, |
|
"loss": 2.9801, |
|
"step": 24300 |
|
}, |
|
{ |
|
"combined_loss": 1.7472858428955078, |
|
"distill_loss": 1.229453206062317, |
|
"epoch": 0.890175104403253, |
|
"step": 24300, |
|
"student_mlm_loss": 2.265118360519409 |
|
}, |
|
{ |
|
"epoch": 0.8938383764378343, |
|
"grad_norm": 17.742507934570312, |
|
"learning_rate": 3.531832813229476e-05, |
|
"loss": 4.6617, |
|
"step": 24400 |
|
}, |
|
{ |
|
"combined_loss": 1.9173786640167236, |
|
"distill_loss": 1.3212807178497314, |
|
"epoch": 0.8938383764378343, |
|
"step": 24400, |
|
"student_mlm_loss": 2.513476610183716 |
|
}, |
|
{ |
|
"epoch": 0.8975016484724155, |
|
"grad_norm": 14.223791122436523, |
|
"learning_rate": 3.525689854289014e-05, |
|
"loss": 3.0537, |
|
"step": 24500 |
|
}, |
|
{ |
|
"combined_loss": 1.7878549098968506, |
|
"distill_loss": 1.2908958196640015, |
|
"epoch": 0.8975016484724155, |
|
"step": 24500, |
|
"student_mlm_loss": 2.28481388092041 |
|
}, |
|
{ |
|
"epoch": 0.9011649205069968, |
|
"grad_norm": 4.241771697998047, |
|
"learning_rate": 3.519546895348552e-05, |
|
"loss": 7.9255, |
|
"step": 24600 |
|
}, |
|
{ |
|
"combined_loss": 1.8853719234466553, |
|
"distill_loss": 1.3350555896759033, |
|
"epoch": 0.9011649205069968, |
|
"step": 24600, |
|
"student_mlm_loss": 2.4356882572174072 |
|
}, |
|
{ |
|
"epoch": 0.9048281925415781, |
|
"grad_norm": 5.793640613555908, |
|
"learning_rate": 3.513403936408089e-05, |
|
"loss": 2.9971, |
|
"step": 24700 |
|
}, |
|
{ |
|
"combined_loss": 9.072087287902832, |
|
"distill_loss": 1.2805593013763428, |
|
"epoch": 0.9048281925415781, |
|
"step": 24700, |
|
"student_mlm_loss": 16.863615036010742 |
|
}, |
|
{ |
|
"epoch": 0.9084914645761595, |
|
"grad_norm": 4.500351905822754, |
|
"learning_rate": 3.507260977467627e-05, |
|
"loss": 2.9841, |
|
"step": 24800 |
|
}, |
|
{ |
|
"combined_loss": 4.229645252227783, |
|
"distill_loss": 1.231893539428711, |
|
"epoch": 0.9084914645761595, |
|
"step": 24800, |
|
"student_mlm_loss": 7.2273969650268555 |
|
}, |
|
{ |
|
"epoch": 0.9121547366107408, |
|
"grad_norm": 24.93678855895996, |
|
"learning_rate": 3.501118018527164e-05, |
|
"loss": 5.2865, |
|
"step": 24900 |
|
}, |
|
{ |
|
"combined_loss": 4.519498825073242, |
|
"distill_loss": 1.35053288936615, |
|
"epoch": 0.9121547366107408, |
|
"step": 24900, |
|
"student_mlm_loss": 7.688465118408203 |
|
}, |
|
{ |
|
"epoch": 0.915818008645322, |
|
"grad_norm": 9.416017532348633, |
|
"learning_rate": 3.494975059586702e-05, |
|
"loss": 2.9688, |
|
"step": 25000 |
|
}, |
|
{ |
|
"combined_loss": 4.33969783782959, |
|
"distill_loss": 1.2811079025268555, |
|
"epoch": 0.915818008645322, |
|
"step": 25000, |
|
"student_mlm_loss": 7.398288249969482 |
|
}, |
|
{ |
|
"epoch": 0.9194812806799033, |
|
"grad_norm": 41.79585266113281, |
|
"learning_rate": 3.4888321006462394e-05, |
|
"loss": 12.352, |
|
"step": 25100 |
|
}, |
|
{ |
|
"combined_loss": 2.398942232131958, |
|
"distill_loss": 1.3129199743270874, |
|
"epoch": 0.9194812806799033, |
|
"step": 25100, |
|
"student_mlm_loss": 3.484964609146118 |
|
}, |
|
{ |
|
"epoch": 0.9231445527144846, |
|
"grad_norm": 27.67843246459961, |
|
"learning_rate": 3.482689141705777e-05, |
|
"loss": 4.6291, |
|
"step": 25200 |
|
}, |
|
{ |
|
"combined_loss": 1.8275630474090576, |
|
"distill_loss": 1.1290583610534668, |
|
"epoch": 0.9231445527144846, |
|
"step": 25200, |
|
"student_mlm_loss": 2.5260677337646484 |
|
}, |
|
{ |
|
"epoch": 0.9268078247490659, |
|
"grad_norm": 57.03019332885742, |
|
"learning_rate": 3.476546182765314e-05, |
|
"loss": 3.8226, |
|
"step": 25300 |
|
}, |
|
{ |
|
"combined_loss": 1.8621808290481567, |
|
"distill_loss": 1.3249785900115967, |
|
"epoch": 0.9268078247490659, |
|
"step": 25300, |
|
"student_mlm_loss": 2.399383068084717 |
|
}, |
|
{ |
|
"epoch": 0.9304710967836471, |
|
"grad_norm": 5.4275007247924805, |
|
"learning_rate": 3.470403223824852e-05, |
|
"loss": 3.7803, |
|
"step": 25400 |
|
}, |
|
{ |
|
"combined_loss": 5.317490100860596, |
|
"distill_loss": 1.3810964822769165, |
|
"epoch": 0.9304710967836471, |
|
"step": 25400, |
|
"student_mlm_loss": 9.253883361816406 |
|
}, |
|
{ |
|
"epoch": 0.9341343688182284, |
|
"grad_norm": 6.36318302154541, |
|
"learning_rate": 3.46426026488439e-05, |
|
"loss": 17.9114, |
|
"step": 25500 |
|
}, |
|
{ |
|
"combined_loss": 4.816742897033691, |
|
"distill_loss": 1.274537444114685, |
|
"epoch": 0.9341343688182284, |
|
"step": 25500, |
|
"student_mlm_loss": 8.358948707580566 |
|
}, |
|
{ |
|
"epoch": 0.9377976408528097, |
|
"grad_norm": 4.670822620391846, |
|
"learning_rate": 3.458117305943927e-05, |
|
"loss": 3.4352, |
|
"step": 25600 |
|
}, |
|
{ |
|
"combined_loss": 1.7166364192962646, |
|
"distill_loss": 1.2876447439193726, |
|
"epoch": 0.9377976408528097, |
|
"step": 25600, |
|
"student_mlm_loss": 2.145627975463867 |
|
}, |
|
{ |
|
"epoch": 0.941460912887391, |
|
"grad_norm": 16.301795959472656, |
|
"learning_rate": 3.451974347003465e-05, |
|
"loss": 2.591, |
|
"step": 25700 |
|
}, |
|
{ |
|
"combined_loss": 1.8349076509475708, |
|
"distill_loss": 1.3192713260650635, |
|
"epoch": 0.941460912887391, |
|
"step": 25700, |
|
"student_mlm_loss": 2.350543975830078 |
|
}, |
|
{ |
|
"epoch": 0.9451241849219723, |
|
"grad_norm": 4.464934349060059, |
|
"learning_rate": 3.4458313880630025e-05, |
|
"loss": 5.3202, |
|
"step": 25800 |
|
}, |
|
{ |
|
"combined_loss": 2.022656202316284, |
|
"distill_loss": 1.4582451581954956, |
|
"epoch": 0.9451241849219723, |
|
"step": 25800, |
|
"student_mlm_loss": 2.587067127227783 |
|
}, |
|
{ |
|
"epoch": 0.9487874569565536, |
|
"grad_norm": 13.280508041381836, |
|
"learning_rate": 3.43968842912254e-05, |
|
"loss": 3.2685, |
|
"step": 25900 |
|
}, |
|
{ |
|
"combined_loss": 1.7409727573394775, |
|
"distill_loss": 1.2449432611465454, |
|
"epoch": 0.9487874569565536, |
|
"step": 25900, |
|
"student_mlm_loss": 2.23700213432312 |
|
}, |
|
{ |
|
"epoch": 0.9524507289911349, |
|
"grad_norm": 34.54155349731445, |
|
"learning_rate": 3.4335454701820774e-05, |
|
"loss": 4.4614, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.9524507289911349, |
|
"eval_loss": 3.371135950088501, |
|
"eval_runtime": 1.9026, |
|
"eval_samples_per_second": 3677.064, |
|
"eval_steps_per_second": 14.717, |
|
"step": 26000 |
|
}, |
|
{ |
|
"combined_loss": 2.1200222969055176, |
|
"distill_loss": 1.4147942066192627, |
|
"epoch": 0.9524507289911349, |
|
"step": 26000, |
|
"student_mlm_loss": 2.8252503871917725 |
|
}, |
|
{ |
|
"epoch": 0.9561140010257162, |
|
"grad_norm": 12.063314437866211, |
|
"learning_rate": 3.427402511241615e-05, |
|
"loss": 3.8605, |
|
"step": 26100 |
|
}, |
|
{ |
|
"combined_loss": 2.440842866897583, |
|
"distill_loss": 1.4115891456604004, |
|
"epoch": 0.9561140010257162, |
|
"step": 26100, |
|
"student_mlm_loss": 3.4700965881347656 |
|
}, |
|
{ |
|
"epoch": 0.9597772730602975, |
|
"grad_norm": 3.154322862625122, |
|
"learning_rate": 3.421259552301152e-05, |
|
"loss": 3.4216, |
|
"step": 26200 |
|
}, |
|
{ |
|
"combined_loss": 2.0511860847473145, |
|
"distill_loss": 1.2086646556854248, |
|
"epoch": 0.9597772730602975, |
|
"step": 26200, |
|
"student_mlm_loss": 2.893707752227783 |
|
}, |
|
{ |
|
"epoch": 0.9634405450948788, |
|
"grad_norm": 4.469895839691162, |
|
"learning_rate": 3.41511659336069e-05, |
|
"loss": 8.4313, |
|
"step": 26300 |
|
}, |
|
{ |
|
"combined_loss": 1.9184556007385254, |
|
"distill_loss": 1.311684489250183, |
|
"epoch": 0.9634405450948788, |
|
"step": 26300, |
|
"student_mlm_loss": 2.525226593017578 |
|
}, |
|
{ |
|
"epoch": 0.96710381712946, |
|
"grad_norm": 37.47445297241211, |
|
"learning_rate": 3.408973634420228e-05, |
|
"loss": 3.33, |
|
"step": 26400 |
|
}, |
|
{ |
|
"combined_loss": 1.8568530082702637, |
|
"distill_loss": 1.3435510396957397, |
|
"epoch": 0.96710381712946, |
|
"step": 26400, |
|
"student_mlm_loss": 2.370154857635498 |
|
}, |
|
{ |
|
"epoch": 0.9707670891640413, |
|
"grad_norm": 5.385250091552734, |
|
"learning_rate": 3.402830675479765e-05, |
|
"loss": 3.0353, |
|
"step": 26500 |
|
}, |
|
{ |
|
"combined_loss": 2.078137159347534, |
|
"distill_loss": 1.4688613414764404, |
|
"epoch": 0.9707670891640413, |
|
"step": 26500, |
|
"student_mlm_loss": 2.687412977218628 |
|
}, |
|
{ |
|
"epoch": 0.9744303611986226, |
|
"grad_norm": 20.363506317138672, |
|
"learning_rate": 3.396687716539303e-05, |
|
"loss": 5.5902, |
|
"step": 26600 |
|
}, |
|
{ |
|
"combined_loss": 2.420652151107788, |
|
"distill_loss": 1.3566147089004517, |
|
"epoch": 0.9744303611986226, |
|
"step": 26600, |
|
"student_mlm_loss": 3.484689474105835 |
|
}, |
|
{ |
|
"epoch": 0.9780936332332039, |
|
"grad_norm": 5.678069591522217, |
|
"learning_rate": 3.3905447575988405e-05, |
|
"loss": 3.1063, |
|
"step": 26700 |
|
}, |
|
{ |
|
"combined_loss": 2.2643003463745117, |
|
"distill_loss": 1.3446204662322998, |
|
"epoch": 0.9780936332332039, |
|
"step": 26700, |
|
"student_mlm_loss": 3.1839799880981445 |
|
}, |
|
{ |
|
"epoch": 0.9817569052677851, |
|
"grad_norm": 8.722668647766113, |
|
"learning_rate": 3.384401798658378e-05, |
|
"loss": 9.3685, |
|
"step": 26800 |
|
}, |
|
{ |
|
"combined_loss": 8.34331226348877, |
|
"distill_loss": 1.3864542245864868, |
|
"epoch": 0.9817569052677851, |
|
"step": 26800, |
|
"student_mlm_loss": 15.3001708984375 |
|
}, |
|
{ |
|
"epoch": 0.9854201773023665, |
|
"grad_norm": 5.101404190063477, |
|
"learning_rate": 3.3782588397179154e-05, |
|
"loss": 3.1112, |
|
"step": 26900 |
|
}, |
|
{ |
|
"combined_loss": 30.241453170776367, |
|
"distill_loss": 1.3818217515945435, |
|
"epoch": 0.9854201773023665, |
|
"step": 26900, |
|
"student_mlm_loss": 59.1010856628418 |
|
}, |
|
{ |
|
"epoch": 0.9890834493369478, |
|
"grad_norm": 3.8359858989715576, |
|
"learning_rate": 3.3721158807774525e-05, |
|
"loss": 3.348, |
|
"step": 27000 |
|
}, |
|
{ |
|
"combined_loss": 1.8264105319976807, |
|
"distill_loss": 1.2956147193908691, |
|
"epoch": 0.9890834493369478, |
|
"step": 27000, |
|
"student_mlm_loss": 2.357206344604492 |
|
}, |
|
{ |
|
"epoch": 0.9927467213715291, |
|
"grad_norm": 33.43736267089844, |
|
"learning_rate": 3.36597292183699e-05, |
|
"loss": 3.5437, |
|
"step": 27100 |
|
}, |
|
{ |
|
"combined_loss": 2.331777572631836, |
|
"distill_loss": 1.3274433612823486, |
|
"epoch": 0.9927467213715291, |
|
"step": 27100, |
|
"student_mlm_loss": 3.3361120223999023 |
|
}, |
|
{ |
|
"epoch": 0.9964099934061104, |
|
"grad_norm": 2.9736690521240234, |
|
"learning_rate": 3.359829962896528e-05, |
|
"loss": 2.828, |
|
"step": 27200 |
|
}, |
|
{ |
|
"combined_loss": 2.0438201427459717, |
|
"distill_loss": 1.334372639656067, |
|
"epoch": 0.9964099934061104, |
|
"step": 27200, |
|
"student_mlm_loss": 2.753267526626587 |
|
}, |
|
{ |
|
"epoch": 1.0000732654406916, |
|
"grad_norm": 3.6774871349334717, |
|
"learning_rate": 3.353687003956066e-05, |
|
"loss": 3.168, |
|
"step": 27300 |
|
}, |
|
{ |
|
"combined_loss": 3.4676733016967773, |
|
"distill_loss": 1.2681790590286255, |
|
"epoch": 1.0000732654406916, |
|
"step": 27300, |
|
"student_mlm_loss": 5.667167663574219 |
|
}, |
|
{ |
|
"epoch": 1.003736537475273, |
|
"grad_norm": 20.265796661376953, |
|
"learning_rate": 3.347544045015603e-05, |
|
"loss": 4.9071, |
|
"step": 27400 |
|
}, |
|
{ |
|
"combined_loss": 1.740236520767212, |
|
"distill_loss": 1.1595730781555176, |
|
"epoch": 1.003736537475273, |
|
"step": 27400, |
|
"student_mlm_loss": 2.3208999633789062 |
|
}, |
|
{ |
|
"epoch": 1.0073998095098542, |
|
"grad_norm": 14.427675247192383, |
|
"learning_rate": 3.341401086075141e-05, |
|
"loss": 3.1375, |
|
"step": 27500 |
|
}, |
|
{ |
|
"combined_loss": 2.0229873657226562, |
|
"distill_loss": 1.3961925506591797, |
|
"epoch": 1.0073998095098542, |
|
"step": 27500, |
|
"student_mlm_loss": 2.6497819423675537 |
|
}, |
|
{ |
|
"epoch": 1.0110630815444355, |
|
"grad_norm": 3.032438039779663, |
|
"learning_rate": 3.3352581271346786e-05, |
|
"loss": 2.7581, |
|
"step": 27600 |
|
}, |
|
{ |
|
"combined_loss": 1.9314367771148682, |
|
"distill_loss": 1.2618595361709595, |
|
"epoch": 1.0110630815444355, |
|
"step": 27600, |
|
"student_mlm_loss": 2.6010141372680664 |
|
}, |
|
{ |
|
"epoch": 1.0147263535790167, |
|
"grad_norm": 6.167496681213379, |
|
"learning_rate": 3.3291151681942163e-05, |
|
"loss": 6.7788, |
|
"step": 27700 |
|
}, |
|
{ |
|
"combined_loss": 2.247697353363037, |
|
"distill_loss": 1.4385483264923096, |
|
"epoch": 1.0147263535790167, |
|
"step": 27700, |
|
"student_mlm_loss": 3.0568461418151855 |
|
}, |
|
{ |
|
"epoch": 1.018389625613598, |
|
"grad_norm": 4.82693338394165, |
|
"learning_rate": 3.3229722092537534e-05, |
|
"loss": 5.9229, |
|
"step": 27800 |
|
}, |
|
{ |
|
"combined_loss": 3.4328160285949707, |
|
"distill_loss": 1.319059133529663, |
|
"epoch": 1.018389625613598, |
|
"step": 27800, |
|
"student_mlm_loss": 5.546572685241699 |
|
}, |
|
{ |
|
"epoch": 1.0220528976481793, |
|
"grad_norm": 13.18911361694336, |
|
"learning_rate": 3.3168292503132906e-05, |
|
"loss": 3.5041, |
|
"step": 27900 |
|
}, |
|
{ |
|
"combined_loss": 3.720487594604492, |
|
"distill_loss": 1.233067274093628, |
|
"epoch": 1.0220528976481793, |
|
"step": 27900, |
|
"student_mlm_loss": 6.207907676696777 |
|
}, |
|
{ |
|
"epoch": 1.0257161696827606, |
|
"grad_norm": 10.725250244140625, |
|
"learning_rate": 3.310686291372829e-05, |
|
"loss": 2.9279, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.0257161696827606, |
|
"eval_loss": 3.3177244663238525, |
|
"eval_runtime": 2.0821, |
|
"eval_samples_per_second": 3360.034, |
|
"eval_steps_per_second": 13.448, |
|
"step": 28000 |
|
}, |
|
{ |
|
"combined_loss": 2.0106987953186035, |
|
"distill_loss": 1.3163011074066162, |
|
"epoch": 1.0257161696827606, |
|
"step": 28000, |
|
"student_mlm_loss": 2.70509672164917 |
|
}, |
|
{ |
|
"epoch": 1.0293794417173419, |
|
"grad_norm": 5.406506538391113, |
|
"learning_rate": 3.304543332432366e-05, |
|
"loss": 3.2149, |
|
"step": 28100 |
|
}, |
|
{ |
|
"combined_loss": 2.042628288269043, |
|
"distill_loss": 1.3173636198043823, |
|
"epoch": 1.0293794417173419, |
|
"step": 28100, |
|
"student_mlm_loss": 2.767892837524414 |
|
}, |
|
{ |
|
"epoch": 1.0330427137519231, |
|
"grad_norm": 3.2733256816864014, |
|
"learning_rate": 3.298400373491904e-05, |
|
"loss": 6.3856, |
|
"step": 28200 |
|
}, |
|
{ |
|
"combined_loss": 1.9145760536193848, |
|
"distill_loss": 1.438834309577942, |
|
"epoch": 1.0330427137519231, |
|
"step": 28200, |
|
"student_mlm_loss": 2.390317916870117 |
|
}, |
|
{ |
|
"epoch": 1.0367059857865044, |
|
"grad_norm": 10.546121597290039, |
|
"learning_rate": 3.292257414551441e-05, |
|
"loss": 3.5422, |
|
"step": 28300 |
|
}, |
|
{ |
|
"combined_loss": 2.6431736946105957, |
|
"distill_loss": 1.367489218711853, |
|
"epoch": 1.0367059857865044, |
|
"step": 28300, |
|
"student_mlm_loss": 3.918858289718628 |
|
}, |
|
{ |
|
"epoch": 1.0403692578210857, |
|
"grad_norm": 25.674352645874023, |
|
"learning_rate": 3.286114455610979e-05, |
|
"loss": 6.2258, |
|
"step": 28400 |
|
}, |
|
{ |
|
"combined_loss": 1.8416577577590942, |
|
"distill_loss": 1.2867157459259033, |
|
"epoch": 1.0403692578210857, |
|
"step": 28400, |
|
"student_mlm_loss": 2.396599769592285 |
|
}, |
|
{ |
|
"epoch": 1.044032529855667, |
|
"grad_norm": 3.6745688915252686, |
|
"learning_rate": 3.2799714966705166e-05, |
|
"loss": 5.0647, |
|
"step": 28500 |
|
}, |
|
{ |
|
"combined_loss": 1.9693520069122314, |
|
"distill_loss": 1.3039644956588745, |
|
"epoch": 1.044032529855667, |
|
"step": 28500, |
|
"student_mlm_loss": 2.634739637374878 |
|
}, |
|
{ |
|
"epoch": 1.0476958018902485, |
|
"grad_norm": 40.79129409790039, |
|
"learning_rate": 3.273828537730054e-05, |
|
"loss": 2.6424, |
|
"step": 28600 |
|
}, |
|
{ |
|
"combined_loss": 2.4251365661621094, |
|
"distill_loss": 1.3121291399002075, |
|
"epoch": 1.0476958018902485, |
|
"step": 28600, |
|
"student_mlm_loss": 3.5381438732147217 |
|
}, |
|
{ |
|
"epoch": 1.0513590739248297, |
|
"grad_norm": 7.185906410217285, |
|
"learning_rate": 3.2676855787895915e-05, |
|
"loss": 2.9095, |
|
"step": 28700 |
|
}, |
|
{ |
|
"combined_loss": 5.781175136566162, |
|
"distill_loss": 1.3236074447631836, |
|
"epoch": 1.0513590739248297, |
|
"step": 28700, |
|
"student_mlm_loss": 10.23874282836914 |
|
}, |
|
{ |
|
"epoch": 1.055022345959411, |
|
"grad_norm": 7.2639079093933105, |
|
"learning_rate": 3.2615426198491286e-05, |
|
"loss": 3.0536, |
|
"step": 28800 |
|
}, |
|
{ |
|
"combined_loss": 1.8534462451934814, |
|
"distill_loss": 1.433970332145691, |
|
"epoch": 1.055022345959411, |
|
"step": 28800, |
|
"student_mlm_loss": 2.2729220390319824 |
|
}, |
|
{ |
|
"epoch": 1.0586856179939923, |
|
"grad_norm": 82.9974365234375, |
|
"learning_rate": 3.255399660908667e-05, |
|
"loss": 3.4605, |
|
"step": 28900 |
|
}, |
|
{ |
|
"combined_loss": 2.385720729827881, |
|
"distill_loss": 1.319982647895813, |
|
"epoch": 1.0586856179939923, |
|
"step": 28900, |
|
"student_mlm_loss": 3.4514589309692383 |
|
}, |
|
{ |
|
"epoch": 1.0623488900285736, |
|
"grad_norm": 8.101861000061035, |
|
"learning_rate": 3.249256701968204e-05, |
|
"loss": 2.9531, |
|
"step": 29000 |
|
}, |
|
{ |
|
"combined_loss": 1.9569958448410034, |
|
"distill_loss": 1.350255012512207, |
|
"epoch": 1.0623488900285736, |
|
"step": 29000, |
|
"student_mlm_loss": 2.5637366771698 |
|
}, |
|
{ |
|
"epoch": 1.0660121620631549, |
|
"grad_norm": 42.843135833740234, |
|
"learning_rate": 3.243113743027742e-05, |
|
"loss": 3.5336, |
|
"step": 29100 |
|
}, |
|
{ |
|
"combined_loss": 2.0199599266052246, |
|
"distill_loss": 1.1558183431625366, |
|
"epoch": 1.0660121620631549, |
|
"step": 29100, |
|
"student_mlm_loss": 2.884101390838623 |
|
}, |
|
{ |
|
"epoch": 1.0696754340977361, |
|
"grad_norm": 10.401261329650879, |
|
"learning_rate": 3.236970784087279e-05, |
|
"loss": 2.6909, |
|
"step": 29200 |
|
}, |
|
{ |
|
"combined_loss": 1.898897409439087, |
|
"distill_loss": 1.2361267805099487, |
|
"epoch": 1.0696754340977361, |
|
"step": 29200, |
|
"student_mlm_loss": 2.5616679191589355 |
|
}, |
|
{ |
|
"epoch": 1.0733387061323174, |
|
"grad_norm": 13.08026123046875, |
|
"learning_rate": 3.230827825146817e-05, |
|
"loss": 10.7499, |
|
"step": 29300 |
|
}, |
|
{ |
|
"combined_loss": 2.385263442993164, |
|
"distill_loss": 1.2960166931152344, |
|
"epoch": 1.0733387061323174, |
|
"step": 29300, |
|
"student_mlm_loss": 3.4745099544525146 |
|
}, |
|
{ |
|
"epoch": 1.0770019781668987, |
|
"grad_norm": 6.8822431564331055, |
|
"learning_rate": 3.2246848662063546e-05, |
|
"loss": 3.0651, |
|
"step": 29400 |
|
}, |
|
{ |
|
"combined_loss": 2.1257505416870117, |
|
"distill_loss": 1.3224972486495972, |
|
"epoch": 1.0770019781668987, |
|
"step": 29400, |
|
"student_mlm_loss": 2.929003953933716 |
|
}, |
|
{ |
|
"epoch": 1.08066525020148, |
|
"grad_norm": 3.4312744140625, |
|
"learning_rate": 3.218541907265892e-05, |
|
"loss": 3.1323, |
|
"step": 29500 |
|
}, |
|
{ |
|
"combined_loss": 2.0117716789245605, |
|
"distill_loss": 1.2447552680969238, |
|
"epoch": 1.08066525020148, |
|
"step": 29500, |
|
"student_mlm_loss": 2.7787880897521973 |
|
}, |
|
{ |
|
"epoch": 1.0843285222360612, |
|
"grad_norm": 3.970820426940918, |
|
"learning_rate": 3.2123989483254295e-05, |
|
"loss": 3.7427, |
|
"step": 29600 |
|
}, |
|
{ |
|
"combined_loss": 2.493256092071533, |
|
"distill_loss": 1.27970290184021, |
|
"epoch": 1.0843285222360612, |
|
"step": 29600, |
|
"student_mlm_loss": 3.7068092823028564 |
|
}, |
|
{ |
|
"epoch": 1.0879917942706425, |
|
"grad_norm": 5.8632426261901855, |
|
"learning_rate": 3.206255989384967e-05, |
|
"loss": 3.0698, |
|
"step": 29700 |
|
}, |
|
{ |
|
"combined_loss": 2.017867088317871, |
|
"distill_loss": 1.408115029335022, |
|
"epoch": 1.0879917942706425, |
|
"step": 29700, |
|
"student_mlm_loss": 2.6276190280914307 |
|
}, |
|
{ |
|
"epoch": 1.0916550663052238, |
|
"grad_norm": 7.350955963134766, |
|
"learning_rate": 3.200113030444505e-05, |
|
"loss": 10.1517, |
|
"step": 29800 |
|
}, |
|
{ |
|
"combined_loss": 3.020230770111084, |
|
"distill_loss": 1.1870992183685303, |
|
"epoch": 1.0916550663052238, |
|
"step": 29800, |
|
"student_mlm_loss": 4.853362083435059 |
|
}, |
|
{ |
|
"epoch": 1.095318338339805, |
|
"grad_norm": 14.347647666931152, |
|
"learning_rate": 3.193970071504042e-05, |
|
"loss": 2.8345, |
|
"step": 29900 |
|
}, |
|
{ |
|
"combined_loss": 1.8037035465240479, |
|
"distill_loss": 1.2421637773513794, |
|
"epoch": 1.095318338339805, |
|
"step": 29900, |
|
"student_mlm_loss": 2.365243434906006 |
|
}, |
|
{ |
|
"epoch": 1.0989816103743864, |
|
"grad_norm": 8.716060638427734, |
|
"learning_rate": 3.18782711256358e-05, |
|
"loss": 4.9073, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.0989816103743864, |
|
"eval_loss": 3.289705753326416, |
|
"eval_runtime": 2.6398, |
|
"eval_samples_per_second": 2650.179, |
|
"eval_steps_per_second": 10.607, |
|
"step": 30000 |
|
}, |
|
{ |
|
"combined_loss": 3.3838839530944824, |
|
"distill_loss": 1.2657897472381592, |
|
"epoch": 1.0989816103743864, |
|
"step": 30000, |
|
"student_mlm_loss": 5.501977920532227 |
|
}, |
|
{ |
|
"epoch": 1.1026448824089676, |
|
"grad_norm": 9.78013801574707, |
|
"learning_rate": 3.181684153623117e-05, |
|
"loss": 6.1366, |
|
"step": 30100 |
|
}, |
|
{ |
|
"combined_loss": 1.8116616010665894, |
|
"distill_loss": 1.3585631847381592, |
|
"epoch": 1.1026448824089676, |
|
"step": 30100, |
|
"student_mlm_loss": 2.2647600173950195 |
|
}, |
|
{ |
|
"epoch": 1.106308154443549, |
|
"grad_norm": 20.41010856628418, |
|
"learning_rate": 3.175541194682655e-05, |
|
"loss": 4.7028, |
|
"step": 30200 |
|
}, |
|
{ |
|
"combined_loss": 1.9074151515960693, |
|
"distill_loss": 1.119224190711975, |
|
"epoch": 1.106308154443549, |
|
"step": 30200, |
|
"student_mlm_loss": 2.695605993270874 |
|
}, |
|
{ |
|
"epoch": 1.1099714264781302, |
|
"grad_norm": 7.005733966827393, |
|
"learning_rate": 3.1693982357421926e-05, |
|
"loss": 4.9073, |
|
"step": 30300 |
|
}, |
|
{ |
|
"combined_loss": 1.7690558433532715, |
|
"distill_loss": 1.2762707471847534, |
|
"epoch": 1.1099714264781302, |
|
"step": 30300, |
|
"student_mlm_loss": 2.2618408203125 |
|
}, |
|
{ |
|
"epoch": 1.1136346985127115, |
|
"grad_norm": 4.290195465087891, |
|
"learning_rate": 3.16325527680173e-05, |
|
"loss": 4.1257, |
|
"step": 30400 |
|
}, |
|
{ |
|
"combined_loss": 15.505983352661133, |
|
"distill_loss": 1.252361536026001, |
|
"epoch": 1.1136346985127115, |
|
"step": 30400, |
|
"student_mlm_loss": 29.759605407714844 |
|
}, |
|
{ |
|
"epoch": 1.1172979705472927, |
|
"grad_norm": 27.59025764465332, |
|
"learning_rate": 3.1571123178612675e-05, |
|
"loss": 3.6319, |
|
"step": 30500 |
|
}, |
|
{ |
|
"combined_loss": 3.190175771713257, |
|
"distill_loss": 1.237632155418396, |
|
"epoch": 1.1172979705472927, |
|
"step": 30500, |
|
"student_mlm_loss": 5.142719268798828 |
|
}, |
|
{ |
|
"epoch": 1.120961242581874, |
|
"grad_norm": 35.681365966796875, |
|
"learning_rate": 3.150969358920805e-05, |
|
"loss": 5.2866, |
|
"step": 30600 |
|
}, |
|
{ |
|
"combined_loss": 2.1486501693725586, |
|
"distill_loss": 1.3570821285247803, |
|
"epoch": 1.120961242581874, |
|
"step": 30600, |
|
"student_mlm_loss": 2.940218448638916 |
|
}, |
|
{ |
|
"epoch": 1.1246245146164555, |
|
"grad_norm": 28.920949935913086, |
|
"learning_rate": 3.144826399980343e-05, |
|
"loss": 11.35, |
|
"step": 30700 |
|
}, |
|
{ |
|
"combined_loss": 3.544619560241699, |
|
"distill_loss": 1.3219174146652222, |
|
"epoch": 1.1246245146164555, |
|
"step": 30700, |
|
"student_mlm_loss": 5.767321586608887 |
|
}, |
|
{ |
|
"epoch": 1.1282877866510368, |
|
"grad_norm": 36.29865264892578, |
|
"learning_rate": 3.13868344103988e-05, |
|
"loss": 8.8748, |
|
"step": 30800 |
|
}, |
|
{ |
|
"combined_loss": 3.136960744857788, |
|
"distill_loss": 1.4069170951843262, |
|
"epoch": 1.1282877866510368, |
|
"step": 30800, |
|
"student_mlm_loss": 4.86700439453125 |
|
}, |
|
{ |
|
"epoch": 1.131951058685618, |
|
"grad_norm": 8.498424530029297, |
|
"learning_rate": 3.132540482099417e-05, |
|
"loss": 2.6175, |
|
"step": 30900 |
|
}, |
|
{ |
|
"combined_loss": 2.584123373031616, |
|
"distill_loss": 1.3318666219711304, |
|
"epoch": 1.131951058685618, |
|
"step": 30900, |
|
"student_mlm_loss": 3.8363800048828125 |
|
}, |
|
{ |
|
"epoch": 1.1356143307201993, |
|
"grad_norm": 8.784627914428711, |
|
"learning_rate": 3.126397523158955e-05, |
|
"loss": 3.7912, |
|
"step": 31000 |
|
}, |
|
{ |
|
"combined_loss": 4.065792083740234, |
|
"distill_loss": 1.279055118560791, |
|
"epoch": 1.1356143307201993, |
|
"step": 31000, |
|
"student_mlm_loss": 6.8525285720825195 |
|
}, |
|
{ |
|
"epoch": 1.1392776027547806, |
|
"grad_norm": 15.763399124145508, |
|
"learning_rate": 3.120254564218493e-05, |
|
"loss": 7.3671, |
|
"step": 31100 |
|
}, |
|
{ |
|
"combined_loss": 1.9532334804534912, |
|
"distill_loss": 1.2137418985366821, |
|
"epoch": 1.1392776027547806, |
|
"step": 31100, |
|
"student_mlm_loss": 2.6927249431610107 |
|
}, |
|
{ |
|
"epoch": 1.142940874789362, |
|
"grad_norm": 6.777341842651367, |
|
"learning_rate": 3.1141116052780306e-05, |
|
"loss": 2.8877, |
|
"step": 31200 |
|
}, |
|
{ |
|
"combined_loss": 3.5847015380859375, |
|
"distill_loss": 1.3712694644927979, |
|
"epoch": 1.142940874789362, |
|
"step": 31200, |
|
"student_mlm_loss": 5.798133850097656 |
|
}, |
|
{ |
|
"epoch": 1.1466041468239432, |
|
"grad_norm": 6.115112781524658, |
|
"learning_rate": 3.107968646337568e-05, |
|
"loss": 3.3763, |
|
"step": 31300 |
|
}, |
|
{ |
|
"combined_loss": 1.899533748626709, |
|
"distill_loss": 1.2805981636047363, |
|
"epoch": 1.1466041468239432, |
|
"step": 31300, |
|
"student_mlm_loss": 2.5184693336486816 |
|
}, |
|
{ |
|
"epoch": 1.1502674188585245, |
|
"grad_norm": 3.3896713256835938, |
|
"learning_rate": 3.1018256873971055e-05, |
|
"loss": 3.2932, |
|
"step": 31400 |
|
}, |
|
{ |
|
"combined_loss": 1.9794254302978516, |
|
"distill_loss": 1.3896270990371704, |
|
"epoch": 1.1502674188585245, |
|
"step": 31400, |
|
"student_mlm_loss": 2.5692238807678223 |
|
}, |
|
{ |
|
"epoch": 1.1539306908931057, |
|
"grad_norm": 12.824034690856934, |
|
"learning_rate": 3.095682728456643e-05, |
|
"loss": 3.5341, |
|
"step": 31500 |
|
}, |
|
{ |
|
"combined_loss": 2.5983529090881348, |
|
"distill_loss": 1.2135576009750366, |
|
"epoch": 1.1539306908931057, |
|
"step": 31500, |
|
"student_mlm_loss": 3.9831480979919434 |
|
}, |
|
{ |
|
"epoch": 1.157593962927687, |
|
"grad_norm": 73.47982025146484, |
|
"learning_rate": 3.089539769516181e-05, |
|
"loss": 2.9879, |
|
"step": 31600 |
|
}, |
|
{ |
|
"combined_loss": 1.8584779500961304, |
|
"distill_loss": 1.3214514255523682, |
|
"epoch": 1.157593962927687, |
|
"step": 31600, |
|
"student_mlm_loss": 2.3955044746398926 |
|
}, |
|
{ |
|
"epoch": 1.1612572349622683, |
|
"grad_norm": 5.6778340339660645, |
|
"learning_rate": 3.083396810575718e-05, |
|
"loss": 2.9781, |
|
"step": 31700 |
|
}, |
|
{ |
|
"combined_loss": 4.854001045227051, |
|
"distill_loss": 1.2088978290557861, |
|
"epoch": 1.1612572349622683, |
|
"step": 31700, |
|
"student_mlm_loss": 8.499104499816895 |
|
}, |
|
{ |
|
"epoch": 1.1649205069968496, |
|
"grad_norm": 17.93754768371582, |
|
"learning_rate": 3.077253851635255e-05, |
|
"loss": 3.5773, |
|
"step": 31800 |
|
}, |
|
{ |
|
"combined_loss": 1.9064607620239258, |
|
"distill_loss": 1.363638997077942, |
|
"epoch": 1.1649205069968496, |
|
"step": 31800, |
|
"student_mlm_loss": 2.449282646179199 |
|
}, |
|
{ |
|
"epoch": 1.1685837790314308, |
|
"grad_norm": 8.912027359008789, |
|
"learning_rate": 3.071110892694794e-05, |
|
"loss": 3.0949, |
|
"step": 31900 |
|
}, |
|
{ |
|
"combined_loss": 1.9666361808776855, |
|
"distill_loss": 1.3997029066085815, |
|
"epoch": 1.1685837790314308, |
|
"step": 31900, |
|
"student_mlm_loss": 2.5335693359375 |
|
}, |
|
{ |
|
"epoch": 1.1722470510660121, |
|
"grad_norm": 21.05866050720215, |
|
"learning_rate": 3.064967933754331e-05, |
|
"loss": 2.965, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.1722470510660121, |
|
"eval_loss": 3.516061544418335, |
|
"eval_runtime": 2.6391, |
|
"eval_samples_per_second": 2650.903, |
|
"eval_steps_per_second": 10.61, |
|
"step": 32000 |
|
}, |
|
{ |
|
"combined_loss": 2.466904640197754, |
|
"distill_loss": 1.2619636058807373, |
|
"epoch": 1.1722470510660121, |
|
"step": 32000, |
|
"student_mlm_loss": 3.6718459129333496 |
|
}, |
|
{ |
|
"epoch": 1.1759103231005934, |
|
"grad_norm": 14.288066864013672, |
|
"learning_rate": 3.0588249748138686e-05, |
|
"loss": 6.5656, |
|
"step": 32100 |
|
}, |
|
{ |
|
"combined_loss": 5.987391471862793, |
|
"distill_loss": 1.3964972496032715, |
|
"epoch": 1.1759103231005934, |
|
"step": 32100, |
|
"student_mlm_loss": 10.578286170959473 |
|
}, |
|
{ |
|
"epoch": 1.1795735951351747, |
|
"grad_norm": 10.953961372375488, |
|
"learning_rate": 3.052682015873406e-05, |
|
"loss": 7.1246, |
|
"step": 32200 |
|
}, |
|
{ |
|
"combined_loss": 1.758845567703247, |
|
"distill_loss": 1.2731348276138306, |
|
"epoch": 1.1795735951351747, |
|
"step": 32200, |
|
"student_mlm_loss": 2.244556188583374 |
|
}, |
|
{ |
|
"epoch": 1.183236867169756, |
|
"grad_norm": 17.076087951660156, |
|
"learning_rate": 3.046539056932944e-05, |
|
"loss": 7.3734, |
|
"step": 32300 |
|
}, |
|
{ |
|
"combined_loss": 1.7941749095916748, |
|
"distill_loss": 1.282630205154419, |
|
"epoch": 1.183236867169756, |
|
"step": 32300, |
|
"student_mlm_loss": 2.3057196140289307 |
|
}, |
|
{ |
|
"epoch": 1.1869001392043372, |
|
"grad_norm": 11.33812427520752, |
|
"learning_rate": 3.040396097992481e-05, |
|
"loss": 5.4979, |
|
"step": 32400 |
|
}, |
|
{ |
|
"combined_loss": 2.379426956176758, |
|
"distill_loss": 1.2975032329559326, |
|
"epoch": 1.1869001392043372, |
|
"step": 32400, |
|
"student_mlm_loss": 3.461350917816162 |
|
}, |
|
{ |
|
"epoch": 1.1905634112389185, |
|
"grad_norm": 3.6378591060638428, |
|
"learning_rate": 3.0342531390520184e-05, |
|
"loss": 5.077, |
|
"step": 32500 |
|
}, |
|
{ |
|
"combined_loss": 1.835166573524475, |
|
"distill_loss": 1.294168472290039, |
|
"epoch": 1.1905634112389185, |
|
"step": 32500, |
|
"student_mlm_loss": 2.376164674758911 |
|
}, |
|
{ |
|
"epoch": 1.1942266832735, |
|
"grad_norm": 23.017444610595703, |
|
"learning_rate": 3.0281101801115562e-05, |
|
"loss": 3.1428, |
|
"step": 32600 |
|
}, |
|
{ |
|
"combined_loss": 1.8867619037628174, |
|
"distill_loss": 1.2372292280197144, |
|
"epoch": 1.1942266832735, |
|
"step": 32600, |
|
"student_mlm_loss": 2.536294460296631 |
|
}, |
|
{ |
|
"epoch": 1.197889955308081, |
|
"grad_norm": 7.055652141571045, |
|
"learning_rate": 3.0219672211710937e-05, |
|
"loss": 8.7118, |
|
"step": 32700 |
|
}, |
|
{ |
|
"combined_loss": 6.59044075012207, |
|
"distill_loss": 1.3554973602294922, |
|
"epoch": 1.197889955308081, |
|
"step": 32700, |
|
"student_mlm_loss": 11.825384140014648 |
|
}, |
|
{ |
|
"epoch": 1.2015532273426626, |
|
"grad_norm": 6.935373783111572, |
|
"learning_rate": 3.0158242622306314e-05, |
|
"loss": 7.5763, |
|
"step": 32800 |
|
}, |
|
{ |
|
"combined_loss": 2.4971964359283447, |
|
"distill_loss": 1.2960432767868042, |
|
"epoch": 1.2015532273426626, |
|
"step": 32800, |
|
"student_mlm_loss": 3.698349714279175 |
|
}, |
|
{ |
|
"epoch": 1.2052164993772438, |
|
"grad_norm": 19.48725700378418, |
|
"learning_rate": 3.009681303290169e-05, |
|
"loss": 5.1993, |
|
"step": 32900 |
|
}, |
|
{ |
|
"combined_loss": 2.639206886291504, |
|
"distill_loss": 1.2536990642547607, |
|
"epoch": 1.2052164993772438, |
|
"step": 32900, |
|
"student_mlm_loss": 4.024714469909668 |
|
}, |
|
{ |
|
"epoch": 1.2088797714118251, |
|
"grad_norm": 215.4875946044922, |
|
"learning_rate": 3.0035383443497067e-05, |
|
"loss": 3.9297, |
|
"step": 33000 |
|
}, |
|
{ |
|
"combined_loss": 2.1888670921325684, |
|
"distill_loss": 1.4587746858596802, |
|
"epoch": 1.2088797714118251, |
|
"step": 33000, |
|
"student_mlm_loss": 2.918959379196167 |
|
}, |
|
{ |
|
"epoch": 1.2125430434464064, |
|
"grad_norm": 5.346382141113281, |
|
"learning_rate": 2.997395385409244e-05, |
|
"loss": 3.3704, |
|
"step": 33100 |
|
}, |
|
{ |
|
"combined_loss": 2.5722949504852295, |
|
"distill_loss": 1.2250982522964478, |
|
"epoch": 1.2125430434464064, |
|
"step": 33100, |
|
"student_mlm_loss": 3.9194915294647217 |
|
}, |
|
{ |
|
"epoch": 1.2162063154809877, |
|
"grad_norm": 21.193038940429688, |
|
"learning_rate": 2.991252426468782e-05, |
|
"loss": 3.22, |
|
"step": 33200 |
|
}, |
|
{ |
|
"combined_loss": 1.8822517395019531, |
|
"distill_loss": 1.264020323753357, |
|
"epoch": 1.2162063154809877, |
|
"step": 33200, |
|
"student_mlm_loss": 2.5004830360412598 |
|
}, |
|
{ |
|
"epoch": 1.219869587515569, |
|
"grad_norm": 8.840603828430176, |
|
"learning_rate": 2.9851094675283193e-05, |
|
"loss": 13.091, |
|
"step": 33300 |
|
}, |
|
{ |
|
"combined_loss": 2.0461645126342773, |
|
"distill_loss": 1.3376085758209229, |
|
"epoch": 1.219869587515569, |
|
"step": 33300, |
|
"student_mlm_loss": 2.7547202110290527 |
|
}, |
|
{ |
|
"epoch": 1.2235328595501502, |
|
"grad_norm": 16.414852142333984, |
|
"learning_rate": 2.9789665085878564e-05, |
|
"loss": 3.6096, |
|
"step": 33400 |
|
}, |
|
{ |
|
"combined_loss": 1.8437246084213257, |
|
"distill_loss": 1.2731173038482666, |
|
"epoch": 1.2235328595501502, |
|
"step": 33400, |
|
"student_mlm_loss": 2.4143319129943848 |
|
}, |
|
{ |
|
"epoch": 1.2271961315847315, |
|
"grad_norm": 5.047356605529785, |
|
"learning_rate": 2.9728235496473946e-05, |
|
"loss": 10.6014, |
|
"step": 33500 |
|
}, |
|
{ |
|
"combined_loss": 2.0613672733306885, |
|
"distill_loss": 1.1784592866897583, |
|
"epoch": 1.2271961315847315, |
|
"step": 33500, |
|
"student_mlm_loss": 2.944275140762329 |
|
}, |
|
{ |
|
"epoch": 1.2308594036193128, |
|
"grad_norm": 8.502574920654297, |
|
"learning_rate": 2.9666805907069317e-05, |
|
"loss": 12.6532, |
|
"step": 33600 |
|
}, |
|
{ |
|
"combined_loss": 2.301725149154663, |
|
"distill_loss": 1.2482868432998657, |
|
"epoch": 1.2308594036193128, |
|
"step": 33600, |
|
"student_mlm_loss": 3.355163335800171 |
|
}, |
|
{ |
|
"epoch": 1.234522675653894, |
|
"grad_norm": 25.97445297241211, |
|
"learning_rate": 2.9605376317664695e-05, |
|
"loss": 3.1296, |
|
"step": 33700 |
|
}, |
|
{ |
|
"combined_loss": 1.8135402202606201, |
|
"distill_loss": 1.309229850769043, |
|
"epoch": 1.234522675653894, |
|
"step": 33700, |
|
"student_mlm_loss": 2.3178505897521973 |
|
}, |
|
{ |
|
"epoch": 1.2381859476884753, |
|
"grad_norm": 7.912507057189941, |
|
"learning_rate": 2.954394672826007e-05, |
|
"loss": 2.9749, |
|
"step": 33800 |
|
}, |
|
{ |
|
"combined_loss": 1.9506487846374512, |
|
"distill_loss": 1.3808802366256714, |
|
"epoch": 1.2381859476884753, |
|
"step": 33800, |
|
"student_mlm_loss": 2.5204174518585205 |
|
}, |
|
{ |
|
"epoch": 1.2418492197230566, |
|
"grad_norm": 28.239988327026367, |
|
"learning_rate": 2.9482517138855447e-05, |
|
"loss": 5.7527, |
|
"step": 33900 |
|
}, |
|
{ |
|
"combined_loss": 1.881349802017212, |
|
"distill_loss": 1.3489292860031128, |
|
"epoch": 1.2418492197230566, |
|
"step": 33900, |
|
"student_mlm_loss": 2.4137701988220215 |
|
}, |
|
{ |
|
"epoch": 1.245512491757638, |
|
"grad_norm": 25.953353881835938, |
|
"learning_rate": 2.942108754945082e-05, |
|
"loss": 4.0339, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.245512491757638, |
|
"eval_loss": 3.297154188156128, |
|
"eval_runtime": 2.3826, |
|
"eval_samples_per_second": 2936.248, |
|
"eval_steps_per_second": 11.752, |
|
"step": 34000 |
|
}, |
|
{ |
|
"combined_loss": 2.5429787635803223, |
|
"distill_loss": 1.2718520164489746, |
|
"epoch": 1.245512491757638, |
|
"step": 34000, |
|
"student_mlm_loss": 3.814105272293091 |
|
}, |
|
{ |
|
"epoch": 1.2491757637922192, |
|
"grad_norm": 48.45500183105469, |
|
"learning_rate": 2.9359657960046196e-05, |
|
"loss": 6.1408, |
|
"step": 34100 |
|
}, |
|
{ |
|
"combined_loss": 4.794422626495361, |
|
"distill_loss": 1.3052036762237549, |
|
"epoch": 1.2491757637922192, |
|
"step": 34100, |
|
"student_mlm_loss": 8.283641815185547 |
|
}, |
|
{ |
|
"epoch": 1.2528390358268005, |
|
"grad_norm": 6.028234004974365, |
|
"learning_rate": 2.9298228370641574e-05, |
|
"loss": 2.9116, |
|
"step": 34200 |
|
}, |
|
{ |
|
"combined_loss": 2.125443458557129, |
|
"distill_loss": 1.25053071975708, |
|
"epoch": 1.2528390358268005, |
|
"step": 34200, |
|
"student_mlm_loss": 3.0003561973571777 |
|
}, |
|
{ |
|
"epoch": 1.2565023078613817, |
|
"grad_norm": 15.824817657470703, |
|
"learning_rate": 2.9236798781236945e-05, |
|
"loss": 3.5834, |
|
"step": 34300 |
|
}, |
|
{ |
|
"combined_loss": 2.156796932220459, |
|
"distill_loss": 1.1805670261383057, |
|
"epoch": 1.2565023078613817, |
|
"step": 34300, |
|
"student_mlm_loss": 3.1330268383026123 |
|
}, |
|
{ |
|
"epoch": 1.260165579895963, |
|
"grad_norm": 8.438326835632324, |
|
"learning_rate": 2.9175369191832326e-05, |
|
"loss": 5.0724, |
|
"step": 34400 |
|
}, |
|
{ |
|
"combined_loss": 3.144615888595581, |
|
"distill_loss": 1.2467416524887085, |
|
"epoch": 1.260165579895963, |
|
"step": 34400, |
|
"student_mlm_loss": 5.042490005493164 |
|
}, |
|
{ |
|
"epoch": 1.2638288519305443, |
|
"grad_norm": 3.7252449989318848, |
|
"learning_rate": 2.9113939602427697e-05, |
|
"loss": 2.9306, |
|
"step": 34500 |
|
}, |
|
{ |
|
"combined_loss": 4.309004783630371, |
|
"distill_loss": 1.2629985809326172, |
|
"epoch": 1.2638288519305443, |
|
"step": 34500, |
|
"student_mlm_loss": 7.355010986328125 |
|
}, |
|
{ |
|
"epoch": 1.2674921239651256, |
|
"grad_norm": 14.86426067352295, |
|
"learning_rate": 2.9052510013023078e-05, |
|
"loss": 3.059, |
|
"step": 34600 |
|
}, |
|
{ |
|
"combined_loss": 2.128227472305298, |
|
"distill_loss": 1.3674236536026, |
|
"epoch": 1.2674921239651256, |
|
"step": 34600, |
|
"student_mlm_loss": 2.889031171798706 |
|
}, |
|
{ |
|
"epoch": 1.271155395999707, |
|
"grad_norm": 14.947731018066406, |
|
"learning_rate": 2.899108042361845e-05, |
|
"loss": 3.0461, |
|
"step": 34700 |
|
}, |
|
{ |
|
"combined_loss": 1.9557018280029297, |
|
"distill_loss": 1.3122907876968384, |
|
"epoch": 1.271155395999707, |
|
"step": 34700, |
|
"student_mlm_loss": 2.5991127490997314 |
|
}, |
|
{ |
|
"epoch": 1.2748186680342881, |
|
"grad_norm": 4.714714527130127, |
|
"learning_rate": 2.8929650834213824e-05, |
|
"loss": 3.0221, |
|
"step": 34800 |
|
}, |
|
{ |
|
"combined_loss": 1.7830932140350342, |
|
"distill_loss": 1.278725028038025, |
|
"epoch": 1.2748186680342881, |
|
"step": 34800, |
|
"student_mlm_loss": 2.287461519241333 |
|
}, |
|
{ |
|
"epoch": 1.2784819400688696, |
|
"grad_norm": 13.885130882263184, |
|
"learning_rate": 2.88682212448092e-05, |
|
"loss": 8.529, |
|
"step": 34900 |
|
}, |
|
{ |
|
"combined_loss": 4.974426746368408, |
|
"distill_loss": 1.4173694849014282, |
|
"epoch": 1.2784819400688696, |
|
"step": 34900, |
|
"student_mlm_loss": 8.53148365020752 |
|
}, |
|
{ |
|
"epoch": 1.2821452121034507, |
|
"grad_norm": 6.786545753479004, |
|
"learning_rate": 2.8806791655404576e-05, |
|
"loss": 3.563, |
|
"step": 35000 |
|
}, |
|
{ |
|
"combined_loss": 1.7134695053100586, |
|
"distill_loss": 1.2251827716827393, |
|
"epoch": 1.2821452121034507, |
|
"step": 35000, |
|
"student_mlm_loss": 2.201756238937378 |
|
}, |
|
{ |
|
"epoch": 1.2858084841380322, |
|
"grad_norm": 18.235891342163086, |
|
"learning_rate": 2.8745362065999954e-05, |
|
"loss": 6.9188, |
|
"step": 35100 |
|
}, |
|
{ |
|
"combined_loss": 6.00921106338501, |
|
"distill_loss": 1.3103188276290894, |
|
"epoch": 1.2858084841380322, |
|
"step": 35100, |
|
"student_mlm_loss": 10.70810317993164 |
|
}, |
|
{ |
|
"epoch": 1.2894717561726134, |
|
"grad_norm": 6.3708696365356445, |
|
"learning_rate": 2.8683932476595328e-05, |
|
"loss": 6.7695, |
|
"step": 35200 |
|
}, |
|
{ |
|
"combined_loss": 2.2400052547454834, |
|
"distill_loss": 1.3289698362350464, |
|
"epoch": 1.2894717561726134, |
|
"step": 35200, |
|
"student_mlm_loss": 3.151040554046631 |
|
}, |
|
{ |
|
"epoch": 1.2931350282071947, |
|
"grad_norm": 7.5602946281433105, |
|
"learning_rate": 2.8622502887190706e-05, |
|
"loss": 9.8005, |
|
"step": 35300 |
|
}, |
|
{ |
|
"combined_loss": 1.848390817642212, |
|
"distill_loss": 1.2897430658340454, |
|
"epoch": 1.2931350282071947, |
|
"step": 35300, |
|
"student_mlm_loss": 2.407038688659668 |
|
}, |
|
{ |
|
"epoch": 1.296798300241776, |
|
"grad_norm": 24.799640655517578, |
|
"learning_rate": 2.8561073297786077e-05, |
|
"loss": 3.2996, |
|
"step": 35400 |
|
}, |
|
{ |
|
"combined_loss": 4.894403457641602, |
|
"distill_loss": 1.282358169555664, |
|
"epoch": 1.296798300241776, |
|
"step": 35400, |
|
"student_mlm_loss": 8.506448745727539 |
|
}, |
|
{ |
|
"epoch": 1.3004615722763573, |
|
"grad_norm": 34.4364013671875, |
|
"learning_rate": 2.849964370838146e-05, |
|
"loss": 3.399, |
|
"step": 35500 |
|
}, |
|
{ |
|
"combined_loss": 1.7965787649154663, |
|
"distill_loss": 1.3232142925262451, |
|
"epoch": 1.3004615722763573, |
|
"step": 35500, |
|
"student_mlm_loss": 2.2699432373046875 |
|
}, |
|
{ |
|
"epoch": 1.3041248443109386, |
|
"grad_norm": 7.9551825523376465, |
|
"learning_rate": 2.843821411897683e-05, |
|
"loss": 3.1887, |
|
"step": 35600 |
|
}, |
|
{ |
|
"combined_loss": 1.855729579925537, |
|
"distill_loss": 1.2217527627944946, |
|
"epoch": 1.3041248443109386, |
|
"step": 35600, |
|
"student_mlm_loss": 2.48970627784729 |
|
}, |
|
{ |
|
"epoch": 1.3077881163455198, |
|
"grad_norm": 5.838754177093506, |
|
"learning_rate": 2.8376784529572204e-05, |
|
"loss": 3.1524, |
|
"step": 35700 |
|
}, |
|
{ |
|
"combined_loss": 2.3417129516601562, |
|
"distill_loss": 1.2872867584228516, |
|
"epoch": 1.3077881163455198, |
|
"step": 35700, |
|
"student_mlm_loss": 3.39613938331604 |
|
}, |
|
{ |
|
"epoch": 1.3114513883801011, |
|
"grad_norm": 4.118559837341309, |
|
"learning_rate": 2.831535494016758e-05, |
|
"loss": 7.9754, |
|
"step": 35800 |
|
}, |
|
{ |
|
"combined_loss": 3.906961679458618, |
|
"distill_loss": 1.2905327081680298, |
|
"epoch": 1.3114513883801011, |
|
"step": 35800, |
|
"student_mlm_loss": 6.523390769958496 |
|
}, |
|
{ |
|
"epoch": 1.3151146604146824, |
|
"grad_norm": 5.229255199432373, |
|
"learning_rate": 2.8253925350762956e-05, |
|
"loss": 3.6586, |
|
"step": 35900 |
|
}, |
|
{ |
|
"combined_loss": 2.6259002685546875, |
|
"distill_loss": 1.217278003692627, |
|
"epoch": 1.3151146604146824, |
|
"step": 35900, |
|
"student_mlm_loss": 4.034522533416748 |
|
}, |
|
{ |
|
"epoch": 1.3187779324492637, |
|
"grad_norm": 9.182631492614746, |
|
"learning_rate": 2.8192495761358334e-05, |
|
"loss": 8.5789, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.3187779324492637, |
|
"eval_loss": 3.3097567558288574, |
|
"eval_runtime": 1.9861, |
|
"eval_samples_per_second": 3522.525, |
|
"eval_steps_per_second": 14.098, |
|
"step": 36000 |
|
}, |
|
{ |
|
"combined_loss": 15.921034812927246, |
|
"distill_loss": 1.2575896978378296, |
|
"epoch": 1.3187779324492637, |
|
"step": 36000, |
|
"student_mlm_loss": 30.58448028564453 |
|
}, |
|
{ |
|
"epoch": 1.322441204483845, |
|
"grad_norm": 5.999209880828857, |
|
"learning_rate": 2.813106617195371e-05, |
|
"loss": 3.6109, |
|
"step": 36100 |
|
}, |
|
{ |
|
"combined_loss": 204.92184448242188, |
|
"distill_loss": 1.2291535139083862, |
|
"epoch": 1.322441204483845, |
|
"step": 36100, |
|
"student_mlm_loss": 408.6145324707031 |
|
}, |
|
{ |
|
"epoch": 1.3261044765184262, |
|
"grad_norm": 8.351846694946289, |
|
"learning_rate": 2.8069636582549086e-05, |
|
"loss": 5.9753, |
|
"step": 36200 |
|
}, |
|
{ |
|
"combined_loss": 3.7332310676574707, |
|
"distill_loss": 1.377110481262207, |
|
"epoch": 1.3261044765184262, |
|
"step": 36200, |
|
"student_mlm_loss": 6.089351654052734 |
|
}, |
|
{ |
|
"epoch": 1.3297677485530075, |
|
"grad_norm": 4.738751411437988, |
|
"learning_rate": 2.800820699314446e-05, |
|
"loss": 2.8706, |
|
"step": 36300 |
|
}, |
|
{ |
|
"combined_loss": 1.949210286140442, |
|
"distill_loss": 1.1820151805877686, |
|
"epoch": 1.3297677485530075, |
|
"step": 36300, |
|
"student_mlm_loss": 2.7164053916931152 |
|
}, |
|
{ |
|
"epoch": 1.3334310205875888, |
|
"grad_norm": 3.7835421562194824, |
|
"learning_rate": 2.7946777403739832e-05, |
|
"loss": 3.5794, |
|
"step": 36400 |
|
}, |
|
{ |
|
"combined_loss": 1.7922800779342651, |
|
"distill_loss": 1.2455928325653076, |
|
"epoch": 1.3334310205875888, |
|
"step": 36400, |
|
"student_mlm_loss": 2.3389673233032227 |
|
}, |
|
{ |
|
"epoch": 1.33709429262217, |
|
"grad_norm": 22.528881072998047, |
|
"learning_rate": 2.788534781433521e-05, |
|
"loss": 3.8623, |
|
"step": 36500 |
|
}, |
|
{ |
|
"combined_loss": 1.788147211074829, |
|
"distill_loss": 1.2254056930541992, |
|
"epoch": 1.33709429262217, |
|
"step": 36500, |
|
"student_mlm_loss": 2.350888729095459 |
|
}, |
|
{ |
|
"epoch": 1.3407575646567513, |
|
"grad_norm": 5.876169681549072, |
|
"learning_rate": 2.7823918224930584e-05, |
|
"loss": 8.4137, |
|
"step": 36600 |
|
}, |
|
{ |
|
"combined_loss": 2.0377962589263916, |
|
"distill_loss": 1.2204126119613647, |
|
"epoch": 1.3407575646567513, |
|
"step": 36600, |
|
"student_mlm_loss": 2.855179786682129 |
|
}, |
|
{ |
|
"epoch": 1.3444208366913326, |
|
"grad_norm": 20.921276092529297, |
|
"learning_rate": 2.7762488635525962e-05, |
|
"loss": 3.5857, |
|
"step": 36700 |
|
}, |
|
{ |
|
"combined_loss": 1.9521321058273315, |
|
"distill_loss": 1.249513864517212, |
|
"epoch": 1.3444208366913326, |
|
"step": 36700, |
|
"student_mlm_loss": 2.654750347137451 |
|
}, |
|
{ |
|
"epoch": 1.348084108725914, |
|
"grad_norm": 13.851704597473145, |
|
"learning_rate": 2.7701059046121336e-05, |
|
"loss": 3.8678, |
|
"step": 36800 |
|
}, |
|
{ |
|
"combined_loss": 2.2560389041900635, |
|
"distill_loss": 1.2315130233764648, |
|
"epoch": 1.348084108725914, |
|
"step": 36800, |
|
"student_mlm_loss": 3.280564785003662 |
|
}, |
|
{ |
|
"epoch": 1.3517473807604952, |
|
"grad_norm": 16.56214714050293, |
|
"learning_rate": 2.7639629456716714e-05, |
|
"loss": 3.3998, |
|
"step": 36900 |
|
}, |
|
{ |
|
"combined_loss": 3.098896026611328, |
|
"distill_loss": 1.3377043008804321, |
|
"epoch": 1.3517473807604952, |
|
"step": 36900, |
|
"student_mlm_loss": 4.860087871551514 |
|
}, |
|
{ |
|
"epoch": 1.3554106527950767, |
|
"grad_norm": 35.91291809082031, |
|
"learning_rate": 2.757819986731209e-05, |
|
"loss": 3.761, |
|
"step": 37000 |
|
}, |
|
{ |
|
"combined_loss": 1.9794631004333496, |
|
"distill_loss": 1.3087836503982544, |
|
"epoch": 1.3554106527950767, |
|
"step": 37000, |
|
"student_mlm_loss": 2.6501426696777344 |
|
}, |
|
{ |
|
"epoch": 1.3590739248296577, |
|
"grad_norm": 11.776296615600586, |
|
"learning_rate": 2.7516770277907466e-05, |
|
"loss": 3.9886, |
|
"step": 37100 |
|
}, |
|
{ |
|
"combined_loss": 2.3107573986053467, |
|
"distill_loss": 1.268768310546875, |
|
"epoch": 1.3590739248296577, |
|
"step": 37100, |
|
"student_mlm_loss": 3.3527464866638184 |
|
}, |
|
{ |
|
"epoch": 1.3627371968642392, |
|
"grad_norm": 13.237029075622559, |
|
"learning_rate": 2.745534068850284e-05, |
|
"loss": 5.3161, |
|
"step": 37200 |
|
}, |
|
{ |
|
"combined_loss": 4.210747718811035, |
|
"distill_loss": 1.4009877443313599, |
|
"epoch": 1.3627371968642392, |
|
"step": 37200, |
|
"student_mlm_loss": 7.0205078125 |
|
}, |
|
{ |
|
"epoch": 1.3664004688988205, |
|
"grad_norm": 18.256624221801758, |
|
"learning_rate": 2.7393911099098212e-05, |
|
"loss": 3.3122, |
|
"step": 37300 |
|
}, |
|
{ |
|
"combined_loss": 2.467655658721924, |
|
"distill_loss": 1.3313319683074951, |
|
"epoch": 1.3664004688988205, |
|
"step": 37300, |
|
"student_mlm_loss": 3.6039793491363525 |
|
}, |
|
{ |
|
"epoch": 1.3700637409334018, |
|
"grad_norm": 3.6821129322052, |
|
"learning_rate": 2.7332481509693593e-05, |
|
"loss": 2.5638, |
|
"step": 37400 |
|
}, |
|
{ |
|
"combined_loss": 4.0961503982543945, |
|
"distill_loss": 1.2590566873550415, |
|
"epoch": 1.3700637409334018, |
|
"step": 37400, |
|
"student_mlm_loss": 6.933243751525879 |
|
}, |
|
{ |
|
"epoch": 1.373727012967983, |
|
"grad_norm": 9.491351127624512, |
|
"learning_rate": 2.7271051920288964e-05, |
|
"loss": 5.2572, |
|
"step": 37500 |
|
}, |
|
{ |
|
"combined_loss": 1.8323596715927124, |
|
"distill_loss": 1.2323403358459473, |
|
"epoch": 1.373727012967983, |
|
"step": 37500, |
|
"student_mlm_loss": 2.4323790073394775 |
|
}, |
|
{ |
|
"epoch": 1.3773902850025643, |
|
"grad_norm": 10.13337516784668, |
|
"learning_rate": 2.7209622330884342e-05, |
|
"loss": 2.9805, |
|
"step": 37600 |
|
}, |
|
{ |
|
"combined_loss": 2.7236733436584473, |
|
"distill_loss": 1.2598845958709717, |
|
"epoch": 1.3773902850025643, |
|
"step": 37600, |
|
"student_mlm_loss": 4.187462329864502 |
|
}, |
|
{ |
|
"epoch": 1.3810535570371456, |
|
"grad_norm": 22.098358154296875, |
|
"learning_rate": 2.7148192741479716e-05, |
|
"loss": 3.1095, |
|
"step": 37700 |
|
}, |
|
{ |
|
"combined_loss": 1.7910634279251099, |
|
"distill_loss": 1.271672010421753, |
|
"epoch": 1.3810535570371456, |
|
"step": 37700, |
|
"student_mlm_loss": 2.310454845428467 |
|
}, |
|
{ |
|
"epoch": 1.3847168290717269, |
|
"grad_norm": 233.01779174804688, |
|
"learning_rate": 2.7086763152075094e-05, |
|
"loss": 3.0334, |
|
"step": 37800 |
|
}, |
|
{ |
|
"combined_loss": 2.449730396270752, |
|
"distill_loss": 1.343329906463623, |
|
"epoch": 1.3847168290717269, |
|
"step": 37800, |
|
"student_mlm_loss": 3.556130886077881 |
|
}, |
|
{ |
|
"epoch": 1.3883801011063082, |
|
"grad_norm": 7.459797382354736, |
|
"learning_rate": 2.702533356267047e-05, |
|
"loss": 5.0088, |
|
"step": 37900 |
|
}, |
|
{ |
|
"combined_loss": 2.047302722930908, |
|
"distill_loss": 1.2358465194702148, |
|
"epoch": 1.3883801011063082, |
|
"step": 37900, |
|
"student_mlm_loss": 2.8587586879730225 |
|
}, |
|
{ |
|
"epoch": 1.3920433731408894, |
|
"grad_norm": 3.9627275466918945, |
|
"learning_rate": 2.6963903973265843e-05, |
|
"loss": 2.7476, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.3920433731408894, |
|
"eval_loss": 4.346156120300293, |
|
"eval_runtime": 1.974, |
|
"eval_samples_per_second": 3544.088, |
|
"eval_steps_per_second": 14.184, |
|
"step": 38000 |
|
}, |
|
{ |
|
"combined_loss": 2.4468555450439453, |
|
"distill_loss": 1.166190505027771, |
|
"epoch": 1.3920433731408894, |
|
"step": 38000, |
|
"student_mlm_loss": 3.72752046585083 |
|
}, |
|
{ |
|
"epoch": 1.3957066451754707, |
|
"grad_norm": 11.812987327575684, |
|
"learning_rate": 2.690247438386122e-05, |
|
"loss": 3.8226, |
|
"step": 38100 |
|
}, |
|
{ |
|
"combined_loss": 2.274935245513916, |
|
"distill_loss": 1.3503799438476562, |
|
"epoch": 1.3957066451754707, |
|
"step": 38100, |
|
"student_mlm_loss": 3.199490785598755 |
|
}, |
|
{ |
|
"epoch": 1.399369917210052, |
|
"grad_norm": 6.545460224151611, |
|
"learning_rate": 2.6841044794456592e-05, |
|
"loss": 4.1598, |
|
"step": 38200 |
|
}, |
|
{ |
|
"combined_loss": 2.1577343940734863, |
|
"distill_loss": 1.2623993158340454, |
|
"epoch": 1.399369917210052, |
|
"step": 38200, |
|
"student_mlm_loss": 3.0530693531036377 |
|
}, |
|
{ |
|
"epoch": 1.4030331892446333, |
|
"grad_norm": 7.286951541900635, |
|
"learning_rate": 2.6779615205051973e-05, |
|
"loss": 3.8211, |
|
"step": 38300 |
|
}, |
|
{ |
|
"combined_loss": 2.479806900024414, |
|
"distill_loss": 1.2152717113494873, |
|
"epoch": 1.4030331892446333, |
|
"step": 38300, |
|
"student_mlm_loss": 3.74434232711792 |
|
}, |
|
{ |
|
"epoch": 1.4066964612792145, |
|
"grad_norm": 18.360294342041016, |
|
"learning_rate": 2.6718185615647344e-05, |
|
"loss": 3.3871, |
|
"step": 38400 |
|
}, |
|
{ |
|
"combined_loss": 1.7289254665374756, |
|
"distill_loss": 1.3171356916427612, |
|
"epoch": 1.4066964612792145, |
|
"step": 38400, |
|
"student_mlm_loss": 2.1407151222229004 |
|
}, |
|
{ |
|
"epoch": 1.4103597333137958, |
|
"grad_norm": 8.086026191711426, |
|
"learning_rate": 2.6656756026242726e-05, |
|
"loss": 2.6337, |
|
"step": 38500 |
|
}, |
|
{ |
|
"combined_loss": 1.9621633291244507, |
|
"distill_loss": 1.3215687274932861, |
|
"epoch": 1.4103597333137958, |
|
"step": 38500, |
|
"student_mlm_loss": 2.6027579307556152 |
|
}, |
|
{ |
|
"epoch": 1.414023005348377, |
|
"grad_norm": 13.378824234008789, |
|
"learning_rate": 2.6595326436838097e-05, |
|
"loss": 3.4032, |
|
"step": 38600 |
|
}, |
|
{ |
|
"combined_loss": 37.448326110839844, |
|
"distill_loss": 1.2198776006698608, |
|
"epoch": 1.414023005348377, |
|
"step": 38600, |
|
"student_mlm_loss": 73.67677307128906 |
|
}, |
|
{ |
|
"epoch": 1.4176862773829584, |
|
"grad_norm": 5.834230422973633, |
|
"learning_rate": 2.653389684743347e-05, |
|
"loss": 6.724, |
|
"step": 38700 |
|
}, |
|
{ |
|
"combined_loss": 1.8702625036239624, |
|
"distill_loss": 1.2802906036376953, |
|
"epoch": 1.4176862773829584, |
|
"step": 38700, |
|
"student_mlm_loss": 2.4602344036102295 |
|
}, |
|
{ |
|
"epoch": 1.4213495494175397, |
|
"grad_norm": 3.5685741901397705, |
|
"learning_rate": 2.647246725802885e-05, |
|
"loss": 3.2721, |
|
"step": 38800 |
|
}, |
|
{ |
|
"combined_loss": 1.7411483526229858, |
|
"distill_loss": 1.285083532333374, |
|
"epoch": 1.4213495494175397, |
|
"step": 38800, |
|
"student_mlm_loss": 2.1972131729125977 |
|
}, |
|
{ |
|
"epoch": 1.4250128214521212, |
|
"grad_norm": 8.644251823425293, |
|
"learning_rate": 2.6411037668624223e-05, |
|
"loss": 13.6859, |
|
"step": 38900 |
|
}, |
|
{ |
|
"combined_loss": 3.234241008758545, |
|
"distill_loss": 1.2654619216918945, |
|
"epoch": 1.4250128214521212, |
|
"step": 38900, |
|
"student_mlm_loss": 5.203020095825195 |
|
}, |
|
{ |
|
"epoch": 1.4286760934867022, |
|
"grad_norm": 15.043992042541504, |
|
"learning_rate": 2.63496080792196e-05, |
|
"loss": 4.3161, |
|
"step": 39000 |
|
}, |
|
{ |
|
"combined_loss": 2.013312339782715, |
|
"distill_loss": 1.2555652856826782, |
|
"epoch": 1.4286760934867022, |
|
"step": 39000, |
|
"student_mlm_loss": 2.771059274673462 |
|
}, |
|
{ |
|
"epoch": 1.4323393655212837, |
|
"grad_norm": 35.315345764160156, |
|
"learning_rate": 2.6288178489814976e-05, |
|
"loss": 6.3089, |
|
"step": 39100 |
|
}, |
|
{ |
|
"combined_loss": 1.7854509353637695, |
|
"distill_loss": 1.2994376420974731, |
|
"epoch": 1.4323393655212837, |
|
"step": 39100, |
|
"student_mlm_loss": 2.2714641094207764 |
|
}, |
|
{ |
|
"epoch": 1.4360026375558648, |
|
"grad_norm": 8.155647277832031, |
|
"learning_rate": 2.6226748900410353e-05, |
|
"loss": 3.3881, |
|
"step": 39200 |
|
}, |
|
{ |
|
"combined_loss": 1.8790473937988281, |
|
"distill_loss": 1.2656193971633911, |
|
"epoch": 1.4360026375558648, |
|
"step": 39200, |
|
"student_mlm_loss": 2.4924752712249756 |
|
}, |
|
{ |
|
"epoch": 1.4396659095904463, |
|
"grad_norm": 4.777060508728027, |
|
"learning_rate": 2.6165319311005725e-05, |
|
"loss": 3.0181, |
|
"step": 39300 |
|
}, |
|
{ |
|
"combined_loss": 2.2714784145355225, |
|
"distill_loss": 1.2724400758743286, |
|
"epoch": 1.4396659095904463, |
|
"step": 39300, |
|
"student_mlm_loss": 3.270516872406006 |
|
}, |
|
{ |
|
"epoch": 1.4433291816250275, |
|
"grad_norm": 3.7660317420959473, |
|
"learning_rate": 2.6103889721601106e-05, |
|
"loss": 3.3045, |
|
"step": 39400 |
|
}, |
|
{ |
|
"combined_loss": 1.9759800434112549, |
|
"distill_loss": 1.1767717599868774, |
|
"epoch": 1.4433291816250275, |
|
"step": 39400, |
|
"student_mlm_loss": 2.775188446044922 |
|
}, |
|
{ |
|
"epoch": 1.4469924536596088, |
|
"grad_norm": 55.78919982910156, |
|
"learning_rate": 2.6042460132196477e-05, |
|
"loss": 3.5094, |
|
"step": 39500 |
|
}, |
|
{ |
|
"combined_loss": 2.5586395263671875, |
|
"distill_loss": 1.3177176713943481, |
|
"epoch": 1.4469924536596088, |
|
"step": 39500, |
|
"student_mlm_loss": 3.7995612621307373 |
|
}, |
|
{ |
|
"epoch": 1.45065572569419, |
|
"grad_norm": 11.648473739624023, |
|
"learning_rate": 2.598103054279185e-05, |
|
"loss": 6.3066, |
|
"step": 39600 |
|
}, |
|
{ |
|
"combined_loss": 1.8263496160507202, |
|
"distill_loss": 1.2649195194244385, |
|
"epoch": 1.45065572569419, |
|
"step": 39600, |
|
"student_mlm_loss": 2.387779712677002 |
|
}, |
|
{ |
|
"epoch": 1.4543189977287714, |
|
"grad_norm": 4.982020378112793, |
|
"learning_rate": 2.591960095338723e-05, |
|
"loss": 3.1475, |
|
"step": 39700 |
|
}, |
|
{ |
|
"combined_loss": 4.95673131942749, |
|
"distill_loss": 1.2415388822555542, |
|
"epoch": 1.4543189977287714, |
|
"step": 39700, |
|
"student_mlm_loss": 8.671923637390137 |
|
}, |
|
{ |
|
"epoch": 1.4579822697633527, |
|
"grad_norm": 4.551340103149414, |
|
"learning_rate": 2.5858171363982604e-05, |
|
"loss": 6.0043, |
|
"step": 39800 |
|
}, |
|
{ |
|
"combined_loss": 2.124246597290039, |
|
"distill_loss": 1.197386384010315, |
|
"epoch": 1.4579822697633527, |
|
"step": 39800, |
|
"student_mlm_loss": 3.0511069297790527 |
|
}, |
|
{ |
|
"epoch": 1.461645541797934, |
|
"grad_norm": 41.217533111572266, |
|
"learning_rate": 2.579674177457798e-05, |
|
"loss": 2.7216, |
|
"step": 39900 |
|
}, |
|
{ |
|
"combined_loss": 1.8579926490783691, |
|
"distill_loss": 1.1948734521865845, |
|
"epoch": 1.461645541797934, |
|
"step": 39900, |
|
"student_mlm_loss": 2.5211119651794434 |
|
}, |
|
{ |
|
"epoch": 1.4653088138325152, |
|
"grad_norm": 3.3428897857666016, |
|
"learning_rate": 2.5735312185173356e-05, |
|
"loss": 3.5888, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.4653088138325152, |
|
"eval_loss": 3.433469295501709, |
|
"eval_runtime": 2.0987, |
|
"eval_samples_per_second": 3333.452, |
|
"eval_steps_per_second": 13.341, |
|
"step": 40000 |
|
}, |
|
{ |
|
"combined_loss": 3.9790029525756836, |
|
"distill_loss": 1.2571158409118652, |
|
"epoch": 1.4653088138325152, |
|
"step": 40000, |
|
"student_mlm_loss": 6.700890064239502 |
|
}, |
|
{ |
|
"epoch": 1.4689720858670965, |
|
"grad_norm": 24.387128829956055, |
|
"learning_rate": 2.5673882595768734e-05, |
|
"loss": 3.3546, |
|
"step": 40100 |
|
}, |
|
{ |
|
"combined_loss": 2.113370418548584, |
|
"distill_loss": 1.2904696464538574, |
|
"epoch": 1.4689720858670965, |
|
"step": 40100, |
|
"student_mlm_loss": 2.9362711906433105 |
|
}, |
|
{ |
|
"epoch": 1.4726353579016778, |
|
"grad_norm": 11.271422386169434, |
|
"learning_rate": 2.5612453006364108e-05, |
|
"loss": 9.1182, |
|
"step": 40200 |
|
}, |
|
{ |
|
"combined_loss": 1.7249795198440552, |
|
"distill_loss": 1.2220125198364258, |
|
"epoch": 1.4726353579016778, |
|
"step": 40200, |
|
"student_mlm_loss": 2.2279465198516846 |
|
}, |
|
{ |
|
"epoch": 1.476298629936259, |
|
"grad_norm": 88.92086029052734, |
|
"learning_rate": 2.555102341695948e-05, |
|
"loss": 5.5622, |
|
"step": 40300 |
|
}, |
|
{ |
|
"combined_loss": 3.5107364654541016, |
|
"distill_loss": 1.2663298845291138, |
|
"epoch": 1.476298629936259, |
|
"step": 40300, |
|
"student_mlm_loss": 5.755143165588379 |
|
}, |
|
{ |
|
"epoch": 1.4799619019708403, |
|
"grad_norm": 4.677048683166504, |
|
"learning_rate": 2.5489593827554857e-05, |
|
"loss": 5.3278, |
|
"step": 40400 |
|
}, |
|
{ |
|
"combined_loss": 3.5298116207122803, |
|
"distill_loss": 1.1846145391464233, |
|
"epoch": 1.4799619019708403, |
|
"step": 40400, |
|
"student_mlm_loss": 5.875008583068848 |
|
}, |
|
{ |
|
"epoch": 1.4836251740054216, |
|
"grad_norm": 21.207704544067383, |
|
"learning_rate": 2.542816423815023e-05, |
|
"loss": 2.9588, |
|
"step": 40500 |
|
}, |
|
{ |
|
"combined_loss": 2.6109657287597656, |
|
"distill_loss": 1.2608091831207275, |
|
"epoch": 1.4836251740054216, |
|
"step": 40500, |
|
"student_mlm_loss": 3.9611220359802246 |
|
}, |
|
{ |
|
"epoch": 1.4872884460400029, |
|
"grad_norm": 7.7415876388549805, |
|
"learning_rate": 2.536673464874561e-05, |
|
"loss": 2.706, |
|
"step": 40600 |
|
}, |
|
{ |
|
"combined_loss": 2.455023765563965, |
|
"distill_loss": 1.3175585269927979, |
|
"epoch": 1.4872884460400029, |
|
"step": 40600, |
|
"student_mlm_loss": 3.5924887657165527 |
|
}, |
|
{ |
|
"epoch": 1.4909517180745842, |
|
"grad_norm": 19.366378784179688, |
|
"learning_rate": 2.5305305059340984e-05, |
|
"loss": 2.7981, |
|
"step": 40700 |
|
}, |
|
{ |
|
"combined_loss": 3.624007225036621, |
|
"distill_loss": 1.1402699947357178, |
|
"epoch": 1.4909517180745842, |
|
"step": 40700, |
|
"student_mlm_loss": 6.1077446937561035 |
|
}, |
|
{ |
|
"epoch": 1.4946149901091654, |
|
"grad_norm": 7.310671806335449, |
|
"learning_rate": 2.524387546993636e-05, |
|
"loss": 29.272, |
|
"step": 40800 |
|
}, |
|
{ |
|
"combined_loss": 2.2329726219177246, |
|
"distill_loss": 1.303555965423584, |
|
"epoch": 1.4946149901091654, |
|
"step": 40800, |
|
"student_mlm_loss": 3.1623895168304443 |
|
}, |
|
{ |
|
"epoch": 1.4982782621437467, |
|
"grad_norm": 48.7297477722168, |
|
"learning_rate": 2.5182445880531736e-05, |
|
"loss": 3.1319, |
|
"step": 40900 |
|
}, |
|
{ |
|
"combined_loss": 1.8255285024642944, |
|
"distill_loss": 1.1643202304840088, |
|
"epoch": 1.4982782621437467, |
|
"step": 40900, |
|
"student_mlm_loss": 2.48673677444458 |
|
}, |
|
{ |
|
"epoch": 1.5019415341783282, |
|
"grad_norm": 32.60409927368164, |
|
"learning_rate": 2.5121016291127114e-05, |
|
"loss": 8.524, |
|
"step": 41000 |
|
}, |
|
{ |
|
"combined_loss": 2.896923542022705, |
|
"distill_loss": 1.3571655750274658, |
|
"epoch": 1.5019415341783282, |
|
"step": 41000, |
|
"student_mlm_loss": 4.436681747436523 |
|
}, |
|
{ |
|
"epoch": 1.5056048062129093, |
|
"grad_norm": 4.127974510192871, |
|
"learning_rate": 2.5059586701722488e-05, |
|
"loss": 6.3087, |
|
"step": 41100 |
|
}, |
|
{ |
|
"combined_loss": 2.145819664001465, |
|
"distill_loss": 1.2983198165893555, |
|
"epoch": 1.5056048062129093, |
|
"step": 41100, |
|
"student_mlm_loss": 2.993319511413574 |
|
}, |
|
{ |
|
"epoch": 1.5092680782474908, |
|
"grad_norm": 3.873206853866577, |
|
"learning_rate": 2.4998157112317863e-05, |
|
"loss": 5.279, |
|
"step": 41200 |
|
}, |
|
{ |
|
"combined_loss": 4.8266730308532715, |
|
"distill_loss": 1.1676665544509888, |
|
"epoch": 1.5092680782474908, |
|
"step": 41200, |
|
"student_mlm_loss": 8.485679626464844 |
|
}, |
|
{ |
|
"epoch": 1.5129313502820718, |
|
"grad_norm": 6.902312755584717, |
|
"learning_rate": 2.493672752291324e-05, |
|
"loss": 5.3583, |
|
"step": 41300 |
|
}, |
|
{ |
|
"combined_loss": 1.7068848609924316, |
|
"distill_loss": 1.1335561275482178, |
|
"epoch": 1.5129313502820718, |
|
"step": 41300, |
|
"student_mlm_loss": 2.2802135944366455 |
|
}, |
|
{ |
|
"epoch": 1.5165946223166533, |
|
"grad_norm": 17.415306091308594, |
|
"learning_rate": 2.487529793350861e-05, |
|
"loss": 2.8319, |
|
"step": 41400 |
|
}, |
|
{ |
|
"combined_loss": 1.5696630477905273, |
|
"distill_loss": 1.152633786201477, |
|
"epoch": 1.5165946223166533, |
|
"step": 41400, |
|
"student_mlm_loss": 1.9866924285888672 |
|
}, |
|
{ |
|
"epoch": 1.5202578943512344, |
|
"grad_norm": 11.67779541015625, |
|
"learning_rate": 2.481386834410399e-05, |
|
"loss": 3.0117, |
|
"step": 41500 |
|
}, |
|
{ |
|
"combined_loss": 1.9209272861480713, |
|
"distill_loss": 1.2611881494522095, |
|
"epoch": 1.5202578943512344, |
|
"step": 41500, |
|
"student_mlm_loss": 2.5806663036346436 |
|
}, |
|
{ |
|
"epoch": 1.5239211663858159, |
|
"grad_norm": 9.814743041992188, |
|
"learning_rate": 2.4752438754699364e-05, |
|
"loss": 2.8479, |
|
"step": 41600 |
|
}, |
|
{ |
|
"combined_loss": 4.1822404861450195, |
|
"distill_loss": 1.254117488861084, |
|
"epoch": 1.5239211663858159, |
|
"step": 41600, |
|
"student_mlm_loss": 7.110363960266113 |
|
}, |
|
{ |
|
"epoch": 1.5275844384203972, |
|
"grad_norm": 11.7344970703125, |
|
"learning_rate": 2.4691009165294742e-05, |
|
"loss": 3.2502, |
|
"step": 41700 |
|
}, |
|
{ |
|
"combined_loss": 1.7558622360229492, |
|
"distill_loss": 1.1821727752685547, |
|
"epoch": 1.5275844384203972, |
|
"step": 41700, |
|
"student_mlm_loss": 2.3295516967773438 |
|
}, |
|
{ |
|
"epoch": 1.5312477104549784, |
|
"grad_norm": 8.426025390625, |
|
"learning_rate": 2.4629579575890116e-05, |
|
"loss": 3.3169, |
|
"step": 41800 |
|
}, |
|
{ |
|
"combined_loss": 1.843000054359436, |
|
"distill_loss": 1.1456735134124756, |
|
"epoch": 1.5312477104549784, |
|
"step": 41800, |
|
"student_mlm_loss": 2.5403265953063965 |
|
}, |
|
{ |
|
"epoch": 1.5349109824895597, |
|
"grad_norm": 3.654872417449951, |
|
"learning_rate": 2.456814998648549e-05, |
|
"loss": 2.6259, |
|
"step": 41900 |
|
}, |
|
{ |
|
"combined_loss": 1.7651002407073975, |
|
"distill_loss": 1.1741529703140259, |
|
"epoch": 1.5349109824895597, |
|
"step": 41900, |
|
"student_mlm_loss": 2.3560476303100586 |
|
}, |
|
{ |
|
"epoch": 1.538574254524141, |
|
"grad_norm": 18.605615615844727, |
|
"learning_rate": 2.450672039708087e-05, |
|
"loss": 2.4854, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.538574254524141, |
|
"eval_loss": 3.4032058715820312, |
|
"eval_runtime": 1.8747, |
|
"eval_samples_per_second": 3731.788, |
|
"eval_steps_per_second": 14.936, |
|
"step": 42000 |
|
}, |
|
{ |
|
"combined_loss": 2.60400390625, |
|
"distill_loss": 1.2034615278244019, |
|
"epoch": 1.538574254524141, |
|
"step": 42000, |
|
"student_mlm_loss": 4.004546165466309 |
|
}, |
|
{ |
|
"epoch": 1.5422375265587223, |
|
"grad_norm": 6.775146484375, |
|
"learning_rate": 2.4445290807676243e-05, |
|
"loss": 2.8405, |
|
"step": 42100 |
|
}, |
|
{ |
|
"combined_loss": 1.7485601902008057, |
|
"distill_loss": 1.1682909727096558, |
|
"epoch": 1.5422375265587223, |
|
"step": 42100, |
|
"student_mlm_loss": 2.328829288482666 |
|
}, |
|
{ |
|
"epoch": 1.5459007985933035, |
|
"grad_norm": 24.79000473022461, |
|
"learning_rate": 2.4383861218271617e-05, |
|
"loss": 2.9811, |
|
"step": 42200 |
|
}, |
|
{ |
|
"combined_loss": 2.2294323444366455, |
|
"distill_loss": 1.262848138809204, |
|
"epoch": 1.5459007985933035, |
|
"step": 42200, |
|
"student_mlm_loss": 3.196016550064087 |
|
}, |
|
{ |
|
"epoch": 1.5495640706278848, |
|
"grad_norm": 11.027627944946289, |
|
"learning_rate": 2.4322431628866992e-05, |
|
"loss": 3.7109, |
|
"step": 42300 |
|
}, |
|
{ |
|
"combined_loss": 1.8129802942276, |
|
"distill_loss": 1.205324411392212, |
|
"epoch": 1.5495640706278848, |
|
"step": 42300, |
|
"student_mlm_loss": 2.4206361770629883 |
|
}, |
|
{ |
|
"epoch": 1.553227342662466, |
|
"grad_norm": 6.328401565551758, |
|
"learning_rate": 2.426100203946237e-05, |
|
"loss": 31.168, |
|
"step": 42400 |
|
}, |
|
{ |
|
"combined_loss": 2.391860246658325, |
|
"distill_loss": 1.1356655359268188, |
|
"epoch": 1.553227342662466, |
|
"step": 42400, |
|
"student_mlm_loss": 3.648054838180542 |
|
}, |
|
{ |
|
"epoch": 1.5568906146970474, |
|
"grad_norm": 26.61184310913086, |
|
"learning_rate": 2.4199572450057744e-05, |
|
"loss": 6.4259, |
|
"step": 42500 |
|
}, |
|
{ |
|
"combined_loss": 3.222200870513916, |
|
"distill_loss": 1.3243845701217651, |
|
"epoch": 1.5568906146970474, |
|
"step": 42500, |
|
"student_mlm_loss": 5.120017051696777 |
|
}, |
|
{ |
|
"epoch": 1.5605538867316286, |
|
"grad_norm": 78.89910888671875, |
|
"learning_rate": 2.4138142860653122e-05, |
|
"loss": 3.3441, |
|
"step": 42600 |
|
}, |
|
{ |
|
"combined_loss": 1.7442145347595215, |
|
"distill_loss": 1.282542109489441, |
|
"epoch": 1.5605538867316286, |
|
"step": 42600, |
|
"student_mlm_loss": 2.2058870792388916 |
|
}, |
|
{ |
|
"epoch": 1.56421715876621, |
|
"grad_norm": 88.92566680908203, |
|
"learning_rate": 2.4076713271248496e-05, |
|
"loss": 2.8234, |
|
"step": 42700 |
|
}, |
|
{ |
|
"combined_loss": 2.366835117340088, |
|
"distill_loss": 1.1711124181747437, |
|
"epoch": 1.56421715876621, |
|
"step": 42700, |
|
"student_mlm_loss": 3.5625579357147217 |
|
}, |
|
{ |
|
"epoch": 1.5678804308007912, |
|
"grad_norm": 6.83758544921875, |
|
"learning_rate": 2.4015283681843874e-05, |
|
"loss": 5.4491, |
|
"step": 42800 |
|
}, |
|
{ |
|
"combined_loss": 4.174956798553467, |
|
"distill_loss": 1.0669249296188354, |
|
"epoch": 1.5678804308007912, |
|
"step": 42800, |
|
"student_mlm_loss": 7.282988548278809 |
|
}, |
|
{ |
|
"epoch": 1.5715437028353727, |
|
"grad_norm": 5.723924160003662, |
|
"learning_rate": 2.395385409243925e-05, |
|
"loss": 3.1108, |
|
"step": 42900 |
|
}, |
|
{ |
|
"combined_loss": 2.3197238445281982, |
|
"distill_loss": 1.2763570547103882, |
|
"epoch": 1.5715437028353727, |
|
"step": 42900, |
|
"student_mlm_loss": 3.3630905151367188 |
|
}, |
|
{ |
|
"epoch": 1.5752069748699538, |
|
"grad_norm": 14.807353973388672, |
|
"learning_rate": 2.3892424503034623e-05, |
|
"loss": 6.4113, |
|
"step": 43000 |
|
}, |
|
{ |
|
"combined_loss": 1.7868092060089111, |
|
"distill_loss": 1.1304634809494019, |
|
"epoch": 1.5752069748699538, |
|
"step": 43000, |
|
"student_mlm_loss": 2.44315505027771 |
|
}, |
|
{ |
|
"epoch": 1.5788702469045353, |
|
"grad_norm": 8.68276596069336, |
|
"learning_rate": 2.3830994913629998e-05, |
|
"loss": 5.1213, |
|
"step": 43100 |
|
}, |
|
{ |
|
"combined_loss": 19.46100425720215, |
|
"distill_loss": 1.259545087814331, |
|
"epoch": 1.5788702469045353, |
|
"step": 43100, |
|
"student_mlm_loss": 37.6624641418457 |
|
}, |
|
{ |
|
"epoch": 1.5825335189391163, |
|
"grad_norm": 4.91242790222168, |
|
"learning_rate": 2.3769565324225372e-05, |
|
"loss": 3.2674, |
|
"step": 43200 |
|
}, |
|
{ |
|
"combined_loss": 1.797656536102295, |
|
"distill_loss": 1.3039189577102661, |
|
"epoch": 1.5825335189391163, |
|
"step": 43200, |
|
"student_mlm_loss": 2.2913942337036133 |
|
}, |
|
{ |
|
"epoch": 1.5861967909736978, |
|
"grad_norm": 52.68294906616211, |
|
"learning_rate": 2.370813573482075e-05, |
|
"loss": 3.7711, |
|
"step": 43300 |
|
}, |
|
{ |
|
"combined_loss": 1.8017528057098389, |
|
"distill_loss": 1.1734706163406372, |
|
"epoch": 1.5861967909736978, |
|
"step": 43300, |
|
"student_mlm_loss": 2.43003511428833 |
|
}, |
|
{ |
|
"epoch": 1.5898600630082789, |
|
"grad_norm": 11.869544982910156, |
|
"learning_rate": 2.3646706145416124e-05, |
|
"loss": 9.8177, |
|
"step": 43400 |
|
}, |
|
{ |
|
"combined_loss": 2.760119915008545, |
|
"distill_loss": 1.2446471452713013, |
|
"epoch": 1.5898600630082789, |
|
"step": 43400, |
|
"student_mlm_loss": 4.275592803955078 |
|
}, |
|
{ |
|
"epoch": 1.5935233350428604, |
|
"grad_norm": 3.7819387912750244, |
|
"learning_rate": 2.3585276556011502e-05, |
|
"loss": 4.6552, |
|
"step": 43500 |
|
}, |
|
{ |
|
"combined_loss": 4.660012245178223, |
|
"distill_loss": 1.1187530755996704, |
|
"epoch": 1.5935233350428604, |
|
"step": 43500, |
|
"student_mlm_loss": 8.201271057128906 |
|
}, |
|
{ |
|
"epoch": 1.5971866070774414, |
|
"grad_norm": 21.269559860229492, |
|
"learning_rate": 2.3523846966606877e-05, |
|
"loss": 8.5404, |
|
"step": 43600 |
|
}, |
|
{ |
|
"combined_loss": 2.3045759201049805, |
|
"distill_loss": 1.3545589447021484, |
|
"epoch": 1.5971866070774414, |
|
"step": 43600, |
|
"student_mlm_loss": 3.2545931339263916 |
|
}, |
|
{ |
|
"epoch": 1.600849879112023, |
|
"grad_norm": 8.289508819580078, |
|
"learning_rate": 2.3462417377202254e-05, |
|
"loss": 2.7135, |
|
"step": 43700 |
|
}, |
|
{ |
|
"combined_loss": 3.0867691040039062, |
|
"distill_loss": 1.1124651432037354, |
|
"epoch": 1.600849879112023, |
|
"step": 43700, |
|
"student_mlm_loss": 5.061073303222656 |
|
}, |
|
{ |
|
"epoch": 1.6045131511466042, |
|
"grad_norm": 22.303661346435547, |
|
"learning_rate": 2.3400987787797625e-05, |
|
"loss": 3.6364, |
|
"step": 43800 |
|
}, |
|
{ |
|
"combined_loss": 1.7930564880371094, |
|
"distill_loss": 1.2114512920379639, |
|
"epoch": 1.6045131511466042, |
|
"step": 43800, |
|
"student_mlm_loss": 2.374661684036255 |
|
}, |
|
{ |
|
"epoch": 1.6081764231811855, |
|
"grad_norm": 4.351790904998779, |
|
"learning_rate": 2.3339558198393003e-05, |
|
"loss": 5.6887, |
|
"step": 43900 |
|
}, |
|
{ |
|
"combined_loss": 1.7365663051605225, |
|
"distill_loss": 1.2089755535125732, |
|
"epoch": 1.6081764231811855, |
|
"step": 43900, |
|
"student_mlm_loss": 2.2641570568084717 |
|
}, |
|
{ |
|
"epoch": 1.6118396952157668, |
|
"grad_norm": 13.450850486755371, |
|
"learning_rate": 2.3278128608988378e-05, |
|
"loss": 3.6702, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.6118396952157668, |
|
"eval_loss": 3.194415330886841, |
|
"eval_runtime": 1.9274, |
|
"eval_samples_per_second": 3629.828, |
|
"eval_steps_per_second": 14.528, |
|
"step": 44000 |
|
}, |
|
{ |
|
"combined_loss": 1.760496735572815, |
|
"distill_loss": 1.1514201164245605, |
|
"epoch": 1.6118396952157668, |
|
"step": 44000, |
|
"student_mlm_loss": 2.3695733547210693 |
|
}, |
|
{ |
|
"epoch": 1.615502967250348, |
|
"grad_norm": 7.381774425506592, |
|
"learning_rate": 2.3216699019583756e-05, |
|
"loss": 2.9269, |
|
"step": 44100 |
|
}, |
|
{ |
|
"combined_loss": 4.663776397705078, |
|
"distill_loss": 1.307958722114563, |
|
"epoch": 1.615502967250348, |
|
"step": 44100, |
|
"student_mlm_loss": 8.019594192504883 |
|
}, |
|
{ |
|
"epoch": 1.6191662392849293, |
|
"grad_norm": 10.999051094055176, |
|
"learning_rate": 2.315526943017913e-05, |
|
"loss": 3.0334, |
|
"step": 44200 |
|
}, |
|
{ |
|
"combined_loss": 1.9191560745239258, |
|
"distill_loss": 1.3481658697128296, |
|
"epoch": 1.6191662392849293, |
|
"step": 44200, |
|
"student_mlm_loss": 2.4901461601257324 |
|
}, |
|
{ |
|
"epoch": 1.6228295113195106, |
|
"grad_norm": 6.187446594238281, |
|
"learning_rate": 2.3093839840774504e-05, |
|
"loss": 30.6923, |
|
"step": 44300 |
|
}, |
|
{ |
|
"combined_loss": 12.122703552246094, |
|
"distill_loss": 1.1659897565841675, |
|
"epoch": 1.6228295113195106, |
|
"step": 44300, |
|
"student_mlm_loss": 23.079418182373047 |
|
}, |
|
{ |
|
"epoch": 1.6264927833540919, |
|
"grad_norm": 6.142828941345215, |
|
"learning_rate": 2.3032410251369882e-05, |
|
"loss": 7.4162, |
|
"step": 44400 |
|
}, |
|
{ |
|
"combined_loss": 1.9456160068511963, |
|
"distill_loss": 1.257858157157898, |
|
"epoch": 1.6264927833540919, |
|
"step": 44400, |
|
"student_mlm_loss": 2.633373737335205 |
|
}, |
|
{ |
|
"epoch": 1.6301560553886731, |
|
"grad_norm": 15.393942832946777, |
|
"learning_rate": 2.2970980661965257e-05, |
|
"loss": 4.8003, |
|
"step": 44500 |
|
}, |
|
{ |
|
"combined_loss": 2.7578635215759277, |
|
"distill_loss": 1.1640808582305908, |
|
"epoch": 1.6301560553886731, |
|
"step": 44500, |
|
"student_mlm_loss": 4.351646423339844 |
|
}, |
|
{ |
|
"epoch": 1.6338193274232544, |
|
"grad_norm": 18.73512077331543, |
|
"learning_rate": 2.290955107256063e-05, |
|
"loss": 5.3592, |
|
"step": 44600 |
|
}, |
|
{ |
|
"combined_loss": 3.758654832839966, |
|
"distill_loss": 1.260606288909912, |
|
"epoch": 1.6338193274232544, |
|
"step": 44600, |
|
"student_mlm_loss": 6.2567033767700195 |
|
}, |
|
{ |
|
"epoch": 1.6374825994578357, |
|
"grad_norm": 6.1570048332214355, |
|
"learning_rate": 2.2848121483156006e-05, |
|
"loss": 10.8594, |
|
"step": 44700 |
|
}, |
|
{ |
|
"combined_loss": 3.205047845840454, |
|
"distill_loss": 1.1495074033737183, |
|
"epoch": 1.6374825994578357, |
|
"step": 44700, |
|
"student_mlm_loss": 5.2605881690979 |
|
}, |
|
{ |
|
"epoch": 1.641145871492417, |
|
"grad_norm": 8.748614311218262, |
|
"learning_rate": 2.2786691893751383e-05, |
|
"loss": 2.611, |
|
"step": 44800 |
|
}, |
|
{ |
|
"combined_loss": 2.7548794746398926, |
|
"distill_loss": 1.153849482536316, |
|
"epoch": 1.641145871492417, |
|
"step": 44800, |
|
"student_mlm_loss": 4.35590934753418 |
|
}, |
|
{ |
|
"epoch": 1.6448091435269983, |
|
"grad_norm": 9.594339370727539, |
|
"learning_rate": 2.2725262304346758e-05, |
|
"loss": 3.621, |
|
"step": 44900 |
|
}, |
|
{ |
|
"combined_loss": 2.63676381111145, |
|
"distill_loss": 1.144437313079834, |
|
"epoch": 1.6448091435269983, |
|
"step": 44900, |
|
"student_mlm_loss": 4.129090309143066 |
|
}, |
|
{ |
|
"epoch": 1.6484724155615798, |
|
"grad_norm": 8.756010055541992, |
|
"learning_rate": 2.2663832714942136e-05, |
|
"loss": 5.0762, |
|
"step": 45000 |
|
}, |
|
{ |
|
"combined_loss": 2.0047507286071777, |
|
"distill_loss": 1.203262209892273, |
|
"epoch": 1.6484724155615798, |
|
"step": 45000, |
|
"student_mlm_loss": 2.806239366531372 |
|
}, |
|
{ |
|
"epoch": 1.6521356875961608, |
|
"grad_norm": 16.163911819458008, |
|
"learning_rate": 2.260240312553751e-05, |
|
"loss": 3.1675, |
|
"step": 45100 |
|
}, |
|
{ |
|
"combined_loss": 1.822305679321289, |
|
"distill_loss": 1.187317967414856, |
|
"epoch": 1.6521356875961608, |
|
"step": 45100, |
|
"student_mlm_loss": 2.4572935104370117 |
|
}, |
|
{ |
|
"epoch": 1.6557989596307423, |
|
"grad_norm": 4.047428607940674, |
|
"learning_rate": 2.2540973536132888e-05, |
|
"loss": 2.6406, |
|
"step": 45200 |
|
}, |
|
{ |
|
"combined_loss": 2.431349039077759, |
|
"distill_loss": 1.2643455266952515, |
|
"epoch": 1.6557989596307423, |
|
"step": 45200, |
|
"student_mlm_loss": 3.5983526706695557 |
|
}, |
|
{ |
|
"epoch": 1.6594622316653234, |
|
"grad_norm": 28.598485946655273, |
|
"learning_rate": 2.247954394672826e-05, |
|
"loss": 3.7667, |
|
"step": 45300 |
|
}, |
|
{ |
|
"combined_loss": 2.274944543838501, |
|
"distill_loss": 1.266087293624878, |
|
"epoch": 1.6594622316653234, |
|
"step": 45300, |
|
"student_mlm_loss": 3.283801794052124 |
|
}, |
|
{ |
|
"epoch": 1.6631255036999049, |
|
"grad_norm": 11.642946243286133, |
|
"learning_rate": 2.2418114357323637e-05, |
|
"loss": 3.0131, |
|
"step": 45400 |
|
}, |
|
{ |
|
"combined_loss": 2.064805507659912, |
|
"distill_loss": 1.2423893213272095, |
|
"epoch": 1.6631255036999049, |
|
"step": 45400, |
|
"student_mlm_loss": 2.8872218132019043 |
|
}, |
|
{ |
|
"epoch": 1.666788775734486, |
|
"grad_norm": 7.227854251861572, |
|
"learning_rate": 2.235668476791901e-05, |
|
"loss": 7.556, |
|
"step": 45500 |
|
}, |
|
{ |
|
"combined_loss": 1.8626993894577026, |
|
"distill_loss": 1.153686761856079, |
|
"epoch": 1.666788775734486, |
|
"step": 45500, |
|
"student_mlm_loss": 2.571712017059326 |
|
}, |
|
{ |
|
"epoch": 1.6704520477690674, |
|
"grad_norm": 11.972105026245117, |
|
"learning_rate": 2.229525517851439e-05, |
|
"loss": 3.9606, |
|
"step": 45600 |
|
}, |
|
{ |
|
"combined_loss": 1.7529842853546143, |
|
"distill_loss": 1.2637630701065063, |
|
"epoch": 1.6704520477690674, |
|
"step": 45600, |
|
"student_mlm_loss": 2.2422056198120117 |
|
}, |
|
{ |
|
"epoch": 1.6741153198036485, |
|
"grad_norm": 4.263253211975098, |
|
"learning_rate": 2.2233825589109764e-05, |
|
"loss": 3.0922, |
|
"step": 45700 |
|
}, |
|
{ |
|
"combined_loss": 2.6089985370635986, |
|
"distill_loss": 1.2136098146438599, |
|
"epoch": 1.6741153198036485, |
|
"step": 45700, |
|
"student_mlm_loss": 4.004387378692627 |
|
}, |
|
{ |
|
"epoch": 1.67777859183823, |
|
"grad_norm": 24.4074764251709, |
|
"learning_rate": 2.2172395999705138e-05, |
|
"loss": 3.2329, |
|
"step": 45800 |
|
}, |
|
{ |
|
"combined_loss": 1.6919562816619873, |
|
"distill_loss": 1.139168381690979, |
|
"epoch": 1.67777859183823, |
|
"step": 45800, |
|
"student_mlm_loss": 2.244744300842285 |
|
}, |
|
{ |
|
"epoch": 1.6814418638728112, |
|
"grad_norm": 5.1518778800964355, |
|
"learning_rate": 2.2110966410300516e-05, |
|
"loss": 9.4019, |
|
"step": 45900 |
|
}, |
|
{ |
|
"combined_loss": 2.1822292804718018, |
|
"distill_loss": 1.3423482179641724, |
|
"epoch": 1.6814418638728112, |
|
"step": 45900, |
|
"student_mlm_loss": 3.0221104621887207 |
|
}, |
|
{ |
|
"epoch": 1.6851051359073925, |
|
"grad_norm": 18.045368194580078, |
|
"learning_rate": 2.204953682089589e-05, |
|
"loss": 3.3662, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.6851051359073925, |
|
"eval_loss": 3.070533275604248, |
|
"eval_runtime": 1.9768, |
|
"eval_samples_per_second": 3539.063, |
|
"eval_steps_per_second": 14.164, |
|
"step": 46000 |
|
}, |
|
{ |
|
"combined_loss": 1.8376495838165283, |
|
"distill_loss": 1.261283278465271, |
|
"epoch": 1.6851051359073925, |
|
"step": 46000, |
|
"student_mlm_loss": 2.414015769958496 |
|
}, |
|
{ |
|
"epoch": 1.6887684079419738, |
|
"grad_norm": 5.69982385635376, |
|
"learning_rate": 2.1988107231491265e-05, |
|
"loss": 3.3451, |
|
"step": 46100 |
|
}, |
|
{ |
|
"combined_loss": 1.7916219234466553, |
|
"distill_loss": 1.2525031566619873, |
|
"epoch": 1.6887684079419738, |
|
"step": 46100, |
|
"student_mlm_loss": 2.3307406902313232 |
|
}, |
|
{ |
|
"epoch": 1.692431679976555, |
|
"grad_norm": 27.134151458740234, |
|
"learning_rate": 2.192667764208664e-05, |
|
"loss": 9.1006, |
|
"step": 46200 |
|
}, |
|
{ |
|
"combined_loss": 59.0687141418457, |
|
"distill_loss": 1.1848413944244385, |
|
"epoch": 1.692431679976555, |
|
"step": 46200, |
|
"student_mlm_loss": 116.95258331298828 |
|
}, |
|
{ |
|
"epoch": 1.6960949520111364, |
|
"grad_norm": 6.624229431152344, |
|
"learning_rate": 2.1865248052682017e-05, |
|
"loss": 3.0016, |
|
"step": 46300 |
|
}, |
|
{ |
|
"combined_loss": 2.7997608184814453, |
|
"distill_loss": 1.1524275541305542, |
|
"epoch": 1.6960949520111364, |
|
"step": 46300, |
|
"student_mlm_loss": 4.447093963623047 |
|
}, |
|
{ |
|
"epoch": 1.6997582240457176, |
|
"grad_norm": 5.472049236297607, |
|
"learning_rate": 2.180381846327739e-05, |
|
"loss": 20.0915, |
|
"step": 46400 |
|
}, |
|
{ |
|
"combined_loss": 1.7153997421264648, |
|
"distill_loss": 1.237658143043518, |
|
"epoch": 1.6997582240457176, |
|
"step": 46400, |
|
"student_mlm_loss": 2.193141460418701 |
|
}, |
|
{ |
|
"epoch": 1.703421496080299, |
|
"grad_norm": 14.290247917175293, |
|
"learning_rate": 2.174238887387277e-05, |
|
"loss": 4.5936, |
|
"step": 46500 |
|
}, |
|
{ |
|
"combined_loss": 1.709627628326416, |
|
"distill_loss": 1.2791212797164917, |
|
"epoch": 1.703421496080299, |
|
"step": 46500, |
|
"student_mlm_loss": 2.140133857727051 |
|
}, |
|
{ |
|
"epoch": 1.7070847681148802, |
|
"grad_norm": 17.962997436523438, |
|
"learning_rate": 2.1680959284468144e-05, |
|
"loss": 3.3627, |
|
"step": 46600 |
|
}, |
|
{ |
|
"combined_loss": 7.8201751708984375, |
|
"distill_loss": 1.3012824058532715, |
|
"epoch": 1.7070847681148802, |
|
"step": 46600, |
|
"student_mlm_loss": 14.339067459106445 |
|
}, |
|
{ |
|
"epoch": 1.7107480401494615, |
|
"grad_norm": 6.800339698791504, |
|
"learning_rate": 2.161952969506352e-05, |
|
"loss": 6.7955, |
|
"step": 46700 |
|
}, |
|
{ |
|
"combined_loss": 1.809753656387329, |
|
"distill_loss": 1.2891262769699097, |
|
"epoch": 1.7107480401494615, |
|
"step": 46700, |
|
"student_mlm_loss": 2.330381155014038 |
|
}, |
|
{ |
|
"epoch": 1.7144113121840427, |
|
"grad_norm": 12.281099319458008, |
|
"learning_rate": 2.1558100105658896e-05, |
|
"loss": 10.3436, |
|
"step": 46800 |
|
}, |
|
{ |
|
"combined_loss": 3.3808600902557373, |
|
"distill_loss": 1.2777303457260132, |
|
"epoch": 1.7144113121840427, |
|
"step": 46800, |
|
"student_mlm_loss": 5.483989715576172 |
|
}, |
|
{ |
|
"epoch": 1.718074584218624, |
|
"grad_norm": 3.3210408687591553, |
|
"learning_rate": 2.149667051625427e-05, |
|
"loss": 2.8055, |
|
"step": 46900 |
|
}, |
|
{ |
|
"combined_loss": 2.1092348098754883, |
|
"distill_loss": 1.2058593034744263, |
|
"epoch": 1.718074584218624, |
|
"step": 46900, |
|
"student_mlm_loss": 3.0126101970672607 |
|
}, |
|
{ |
|
"epoch": 1.7217378562532053, |
|
"grad_norm": 11.694738388061523, |
|
"learning_rate": 2.1435240926849645e-05, |
|
"loss": 4.6311, |
|
"step": 47000 |
|
}, |
|
{ |
|
"combined_loss": 2.2222890853881836, |
|
"distill_loss": 1.218597173690796, |
|
"epoch": 1.7217378562532053, |
|
"step": 47000, |
|
"student_mlm_loss": 3.2259812355041504 |
|
}, |
|
{ |
|
"epoch": 1.7254011282877868, |
|
"grad_norm": 23.036334991455078, |
|
"learning_rate": 2.137381133744502e-05, |
|
"loss": 2.5923, |
|
"step": 47100 |
|
}, |
|
{ |
|
"combined_loss": 1.882810354232788, |
|
"distill_loss": 1.2441027164459229, |
|
"epoch": 1.7254011282877868, |
|
"step": 47100, |
|
"student_mlm_loss": 2.5215179920196533 |
|
}, |
|
{ |
|
"epoch": 1.7290644003223679, |
|
"grad_norm": 65.06354522705078, |
|
"learning_rate": 2.1312381748040397e-05, |
|
"loss": 3.3375, |
|
"step": 47200 |
|
}, |
|
{ |
|
"combined_loss": 1.84983229637146, |
|
"distill_loss": 1.224557876586914, |
|
"epoch": 1.7290644003223679, |
|
"step": 47200, |
|
"student_mlm_loss": 2.475106716156006 |
|
}, |
|
{ |
|
"epoch": 1.7327276723569494, |
|
"grad_norm": 9.202945709228516, |
|
"learning_rate": 2.1250952158635772e-05, |
|
"loss": 3.0094, |
|
"step": 47300 |
|
}, |
|
{ |
|
"combined_loss": 1.6417255401611328, |
|
"distill_loss": 1.2296794652938843, |
|
"epoch": 1.7327276723569494, |
|
"step": 47300, |
|
"student_mlm_loss": 2.053771734237671 |
|
}, |
|
{ |
|
"epoch": 1.7363909443915304, |
|
"grad_norm": 7.1568193435668945, |
|
"learning_rate": 2.118952256923115e-05, |
|
"loss": 3.3413, |
|
"step": 47400 |
|
}, |
|
{ |
|
"combined_loss": 2.165384531021118, |
|
"distill_loss": 1.2572156190872192, |
|
"epoch": 1.7363909443915304, |
|
"step": 47400, |
|
"student_mlm_loss": 3.0735535621643066 |
|
}, |
|
{ |
|
"epoch": 1.740054216426112, |
|
"grad_norm": 39.054439544677734, |
|
"learning_rate": 2.1128092979826524e-05, |
|
"loss": 4.8522, |
|
"step": 47500 |
|
}, |
|
{ |
|
"combined_loss": 2.6122236251831055, |
|
"distill_loss": 1.1487023830413818, |
|
"epoch": 1.740054216426112, |
|
"step": 47500, |
|
"student_mlm_loss": 4.07574462890625 |
|
}, |
|
{ |
|
"epoch": 1.743717488460693, |
|
"grad_norm": 3.18758487701416, |
|
"learning_rate": 2.1066663390421902e-05, |
|
"loss": 4.3993, |
|
"step": 47600 |
|
}, |
|
{ |
|
"combined_loss": 6.344114303588867, |
|
"distill_loss": 1.1341725587844849, |
|
"epoch": 1.743717488460693, |
|
"step": 47600, |
|
"student_mlm_loss": 11.554056167602539 |
|
}, |
|
{ |
|
"epoch": 1.7473807604952745, |
|
"grad_norm": 9.418896675109863, |
|
"learning_rate": 2.1005233801017273e-05, |
|
"loss": 8.7279, |
|
"step": 47700 |
|
}, |
|
{ |
|
"combined_loss": 2.8721518516540527, |
|
"distill_loss": 1.2175838947296143, |
|
"epoch": 1.7473807604952745, |
|
"step": 47700, |
|
"student_mlm_loss": 4.526719570159912 |
|
}, |
|
{ |
|
"epoch": 1.7510440325298555, |
|
"grad_norm": 4.730939865112305, |
|
"learning_rate": 2.094380421161265e-05, |
|
"loss": 2.74, |
|
"step": 47800 |
|
}, |
|
{ |
|
"combined_loss": 1.8483730554580688, |
|
"distill_loss": 1.2789607048034668, |
|
"epoch": 1.7510440325298555, |
|
"step": 47800, |
|
"student_mlm_loss": 2.417785406112671 |
|
}, |
|
{ |
|
"epoch": 1.754707304564437, |
|
"grad_norm": 4.566458225250244, |
|
"learning_rate": 2.0882374622208025e-05, |
|
"loss": 2.63, |
|
"step": 47900 |
|
}, |
|
{ |
|
"combined_loss": 1.8073049783706665, |
|
"distill_loss": 1.3073413372039795, |
|
"epoch": 1.754707304564437, |
|
"step": 47900, |
|
"student_mlm_loss": 2.3072686195373535 |
|
}, |
|
{ |
|
"epoch": 1.7583705765990183, |
|
"grad_norm": 14.967068672180176, |
|
"learning_rate": 2.0820945032803403e-05, |
|
"loss": 2.5821, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.7583705765990183, |
|
"eval_loss": 3.2400870323181152, |
|
"eval_runtime": 1.8322, |
|
"eval_samples_per_second": 3818.29, |
|
"eval_steps_per_second": 15.282, |
|
"step": 48000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 81894, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7150683130961408e+16, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|