diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6151 @@ +{ + "best_metric": 0.65735477, + "best_model_checkpoint": "/cpfs04/shared/rlproject/zhangqi/model_garden/0709_intern2b_v7-1-part15-19-resize-decay/v0-20250710-072707/checkpoint-3000", + "epoch": 0.9599616015359386, + "eval_steps": 250, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003199872005119795, + "grad_norm": 0.06346331978668461, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.4898327589035034, + "memory(GiB)": 37.39, + "step": 1, + "token_acc": 0.8589147286821706, + "train_speed(iter/s)": 0.017141 + }, + { + "epoch": 0.0015999360025598975, + "grad_norm": 0.06701772816278752, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.479672372341156, + "memory(GiB)": 57.22, + "step": 5, + "token_acc": 0.8759901666211418, + "train_speed(iter/s)": 0.029212 + }, + { + "epoch": 0.003199872005119795, + "grad_norm": 0.0577242460197625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4767899990081787, + "memory(GiB)": 57.22, + "step": 10, + "token_acc": 0.8920853213584058, + "train_speed(iter/s)": 0.030985 + }, + { + "epoch": 0.004799808007679693, + "grad_norm": 0.05378464715409511, + "learning_rate": 3e-06, + "loss": 0.46750531196594236, + "memory(GiB)": 57.22, + "step": 15, + "token_acc": 0.846137292877125, + "train_speed(iter/s)": 0.030693 + }, + { + "epoch": 0.00639974401023959, + "grad_norm": 0.05482863789544427, + "learning_rate": 4.000000000000001e-06, + "loss": 0.47017059326171873, + "memory(GiB)": 57.22, + "step": 20, + "token_acc": 0.8701638201463925, + "train_speed(iter/s)": 0.031839 + }, + { + "epoch": 0.007999680012799487, + "grad_norm": 0.05575025715512655, + "learning_rate": 5e-06, + "loss": 0.46974716186523435, + "memory(GiB)": 57.22, + "step": 25, + "token_acc": 0.8939419941650936, + "train_speed(iter/s)": 0.031203 + }, + { + "epoch": 0.009599616015359386, + "grad_norm": 0.05007200892621057, + "learning_rate": 6e-06, + "loss": 0.4700496196746826, + "memory(GiB)": 57.22, + "step": 30, + "token_acc": 0.872761844398669, + "train_speed(iter/s)": 0.030738 + }, + { + "epoch": 0.011199552017919284, + "grad_norm": 0.06284973201247342, + "learning_rate": 7e-06, + "loss": 0.477018404006958, + "memory(GiB)": 57.22, + "step": 35, + "token_acc": 0.8738273921200751, + "train_speed(iter/s)": 0.031491 + }, + { + "epoch": 0.01279948802047918, + "grad_norm": 0.051858307829802386, + "learning_rate": 8.000000000000001e-06, + "loss": 0.46584124565124513, + "memory(GiB)": 57.22, + "step": 40, + "token_acc": 0.8791312559017942, + "train_speed(iter/s)": 0.031035 + }, + { + "epoch": 0.014399424023039079, + "grad_norm": 0.05024484287076301, + "learning_rate": 9e-06, + "loss": 0.4685808658599854, + "memory(GiB)": 57.22, + "step": 45, + "token_acc": 0.8846325167037862, + "train_speed(iter/s)": 0.031503 + }, + { + "epoch": 0.015999360025598975, + "grad_norm": 0.06025612278216295, + "learning_rate": 1e-05, + "loss": 0.4697974681854248, + "memory(GiB)": 57.22, + "step": 50, + "token_acc": 0.854153041203401, + "train_speed(iter/s)": 0.031549 + }, + { + "epoch": 0.017599296028158875, + "grad_norm": 0.05171252494611451, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.4622661113739014, + "memory(GiB)": 73.06, + "step": 55, + "token_acc": 0.8281821878812525, + "train_speed(iter/s)": 0.031181 + }, + { + "epoch": 0.01919923203071877, + "grad_norm": 0.06238294030934267, + "learning_rate": 1.2e-05, + "loss": 0.47000856399536134, + "memory(GiB)": 73.06, + "step": 60, + "token_acc": 0.8703465982028241, + "train_speed(iter/s)": 0.031617 + }, + { + "epoch": 0.020799168033278668, + "grad_norm": 0.05576663204628902, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.4765446186065674, + "memory(GiB)": 73.06, + "step": 65, + "token_acc": 0.867019517036057, + "train_speed(iter/s)": 0.031138 + }, + { + "epoch": 0.022399104035838568, + "grad_norm": 0.054406694286476175, + "learning_rate": 1.4e-05, + "loss": 0.47959036827087403, + "memory(GiB)": 73.06, + "step": 70, + "token_acc": 0.8927940657011657, + "train_speed(iter/s)": 0.030807 + }, + { + "epoch": 0.023999040038398464, + "grad_norm": 0.05458413282309297, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.45742173194885255, + "memory(GiB)": 73.06, + "step": 75, + "token_acc": 0.8916037316748112, + "train_speed(iter/s)": 0.031204 + }, + { + "epoch": 0.02559897604095836, + "grad_norm": 0.05986601718141533, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.46221466064453126, + "memory(GiB)": 73.06, + "step": 80, + "token_acc": 0.8513287970214405, + "train_speed(iter/s)": 0.030907 + }, + { + "epoch": 0.02719891204351826, + "grad_norm": 0.0610597448330617, + "learning_rate": 1.7e-05, + "loss": 0.4636178493499756, + "memory(GiB)": 73.06, + "step": 85, + "token_acc": 0.8755391250770179, + "train_speed(iter/s)": 0.030852 + }, + { + "epoch": 0.028798848046078157, + "grad_norm": 0.0537272421077024, + "learning_rate": 1.8e-05, + "loss": 0.4674212455749512, + "memory(GiB)": 73.06, + "step": 90, + "token_acc": 0.8330635370295427, + "train_speed(iter/s)": 0.03096 + }, + { + "epoch": 0.030398784048638054, + "grad_norm": 0.05874793146275866, + "learning_rate": 1.9e-05, + "loss": 0.459043550491333, + "memory(GiB)": 73.06, + "step": 95, + "token_acc": 0.8686557483379277, + "train_speed(iter/s)": 0.030744 + }, + { + "epoch": 0.03199872005119795, + "grad_norm": 0.0635697375742028, + "learning_rate": 2e-05, + "loss": 0.4696988582611084, + "memory(GiB)": 73.06, + "step": 100, + "token_acc": 0.8658624414836155, + "train_speed(iter/s)": 0.030967 + }, + { + "epoch": 0.03359865605375785, + "grad_norm": 0.05212754941929234, + "learning_rate": 1.9999865178850847e-05, + "loss": 0.47245235443115235, + "memory(GiB)": 73.06, + "step": 105, + "token_acc": 0.8773064185459707, + "train_speed(iter/s)": 0.030857 + }, + { + "epoch": 0.03519859205631775, + "grad_norm": 0.0584305183084298, + "learning_rate": 1.999946071903873e-05, + "loss": 0.4617309093475342, + "memory(GiB)": 73.06, + "step": 110, + "token_acc": 0.8634434872058634, + "train_speed(iter/s)": 0.030617 + }, + { + "epoch": 0.03679852805887764, + "grad_norm": 0.0594956422290189, + "learning_rate": 1.9998786631469602e-05, + "loss": 0.4737922191619873, + "memory(GiB)": 73.06, + "step": 115, + "token_acc": 0.8586429725363489, + "train_speed(iter/s)": 0.030917 + }, + { + "epoch": 0.03839846406143754, + "grad_norm": 0.05542732612540538, + "learning_rate": 1.999784293431971e-05, + "loss": 0.46903514862060547, + "memory(GiB)": 73.06, + "step": 120, + "token_acc": 0.8490970309152127, + "train_speed(iter/s)": 0.030707 + }, + { + "epoch": 0.03999840006399744, + "grad_norm": 0.06279644130355853, + "learning_rate": 1.9996629653035128e-05, + "loss": 0.47660508155822756, + "memory(GiB)": 73.06, + "step": 125, + "token_acc": 0.8797399783315276, + "train_speed(iter/s)": 0.03059 + }, + { + "epoch": 0.041598336066557336, + "grad_norm": 0.0589281224878424, + "learning_rate": 1.999514682033104e-05, + "loss": 0.47182955741882326, + "memory(GiB)": 73.06, + "step": 130, + "token_acc": 0.8641215106732348, + "train_speed(iter/s)": 0.030792 + }, + { + "epoch": 0.043198272069117236, + "grad_norm": 0.06201767062726238, + "learning_rate": 1.99933944761909e-05, + "loss": 0.4760914325714111, + "memory(GiB)": 73.06, + "step": 135, + "token_acc": 0.856929955290611, + "train_speed(iter/s)": 0.030572 + }, + { + "epoch": 0.044798208071677136, + "grad_norm": 0.06842512509591067, + "learning_rate": 1.999137266786531e-05, + "loss": 0.4673017501831055, + "memory(GiB)": 73.06, + "step": 140, + "token_acc": 0.8725868725868726, + "train_speed(iter/s)": 0.030583 + }, + { + "epoch": 0.04639814407423703, + "grad_norm": 0.06206860242026748, + "learning_rate": 1.998908144987078e-05, + "loss": 0.4650784969329834, + "memory(GiB)": 73.06, + "step": 145, + "token_acc": 0.8592991206195039, + "train_speed(iter/s)": 0.030589 + }, + { + "epoch": 0.04799808007679693, + "grad_norm": 0.07443005443822684, + "learning_rate": 1.9986520883988233e-05, + "loss": 0.46671695709228517, + "memory(GiB)": 73.06, + "step": 150, + "token_acc": 0.8422288642186165, + "train_speed(iter/s)": 0.030331 + }, + { + "epoch": 0.04959801607935683, + "grad_norm": 0.06105984389773331, + "learning_rate": 1.9983691039261358e-05, + "loss": 0.47356271743774414, + "memory(GiB)": 73.06, + "step": 155, + "token_acc": 0.8814565604591332, + "train_speed(iter/s)": 0.030476 + }, + { + "epoch": 0.05119795208191672, + "grad_norm": 0.08001789963695773, + "learning_rate": 1.998059199199474e-05, + "loss": 0.4710524559020996, + "memory(GiB)": 73.06, + "step": 160, + "token_acc": 0.8710629921259843, + "train_speed(iter/s)": 0.03042 + }, + { + "epoch": 0.05279788808447662, + "grad_norm": 0.05874066660842649, + "learning_rate": 1.9977223825751802e-05, + "loss": 0.46933708190917967, + "memory(GiB)": 73.06, + "step": 165, + "token_acc": 0.894688221709007, + "train_speed(iter/s)": 0.030265 + }, + { + "epoch": 0.05439782408703652, + "grad_norm": 0.06544476253513323, + "learning_rate": 1.997358663135255e-05, + "loss": 0.46097607612609864, + "memory(GiB)": 73.06, + "step": 170, + "token_acc": 0.8832285384319261, + "train_speed(iter/s)": 0.030391 + }, + { + "epoch": 0.055997760089596414, + "grad_norm": 0.052612734205809274, + "learning_rate": 1.9969680506871138e-05, + "loss": 0.4674376010894775, + "memory(GiB)": 73.06, + "step": 175, + "token_acc": 0.8822400558269365, + "train_speed(iter/s)": 0.030298 + }, + { + "epoch": 0.057597696092156314, + "grad_norm": 0.05831524002340728, + "learning_rate": 1.9965505557633188e-05, + "loss": 0.47021942138671874, + "memory(GiB)": 73.06, + "step": 180, + "token_acc": 0.8729036501150937, + "train_speed(iter/s)": 0.030222 + }, + { + "epoch": 0.059197632094716214, + "grad_norm": 0.05558463298745032, + "learning_rate": 1.9961061896213006e-05, + "loss": 0.4707474708557129, + "memory(GiB)": 73.06, + "step": 185, + "token_acc": 0.8507278220269157, + "train_speed(iter/s)": 0.030322 + }, + { + "epoch": 0.06079756809727611, + "grad_norm": 0.05835065786169716, + "learning_rate": 1.9956349642430494e-05, + "loss": 0.4792951583862305, + "memory(GiB)": 73.06, + "step": 190, + "token_acc": 0.8596896665566194, + "train_speed(iter/s)": 0.030221 + }, + { + "epoch": 0.06239750409983601, + "grad_norm": 0.055138234334700054, + "learning_rate": 1.9951368923347945e-05, + "loss": 0.4755210876464844, + "memory(GiB)": 73.06, + "step": 195, + "token_acc": 0.8501170960187353, + "train_speed(iter/s)": 0.030259 + }, + { + "epoch": 0.0639974401023959, + "grad_norm": 0.06763946666899583, + "learning_rate": 1.9946119873266615e-05, + "loss": 0.4560092926025391, + "memory(GiB)": 73.06, + "step": 200, + "token_acc": 0.8891928864569083, + "train_speed(iter/s)": 0.030234 + }, + { + "epoch": 0.0655973761049558, + "grad_norm": 0.056758126149685124, + "learning_rate": 1.9940602633723097e-05, + "loss": 0.470977258682251, + "memory(GiB)": 73.06, + "step": 205, + "token_acc": 0.8635224424698109, + "train_speed(iter/s)": 0.030086 + }, + { + "epoch": 0.0671973121075157, + "grad_norm": 0.06089496841996932, + "learning_rate": 1.99348173534855e-05, + "loss": 0.4699739456176758, + "memory(GiB)": 73.06, + "step": 210, + "token_acc": 0.8621679827709978, + "train_speed(iter/s)": 0.03019 + }, + { + "epoch": 0.06879724811007559, + "grad_norm": 0.05802383504947049, + "learning_rate": 1.9928764188549462e-05, + "loss": 0.46386079788208007, + "memory(GiB)": 73.06, + "step": 215, + "token_acc": 0.8754250939681403, + "train_speed(iter/s)": 0.030114 + }, + { + "epoch": 0.0703971841126355, + "grad_norm": 0.05515703750577632, + "learning_rate": 1.9922443302133906e-05, + "loss": 0.4679898262023926, + "memory(GiB)": 73.06, + "step": 220, + "token_acc": 0.88738807102747, + "train_speed(iter/s)": 0.030007 + }, + { + "epoch": 0.07199712011519539, + "grad_norm": 0.05967474480415036, + "learning_rate": 1.9915854864676665e-05, + "loss": 0.47310919761657716, + "memory(GiB)": 73.06, + "step": 225, + "token_acc": 0.8383795309168444, + "train_speed(iter/s)": 0.030071 + }, + { + "epoch": 0.07359705611775529, + "grad_norm": 0.06076387420670948, + "learning_rate": 1.990899905382988e-05, + "loss": 0.4678232192993164, + "memory(GiB)": 73.06, + "step": 230, + "token_acc": 0.8767123287671232, + "train_speed(iter/s)": 0.029948 + }, + { + "epoch": 0.07519699212031519, + "grad_norm": 0.05635803811030448, + "learning_rate": 1.9901876054455217e-05, + "loss": 0.4821170330047607, + "memory(GiB)": 73.06, + "step": 235, + "token_acc": 0.8841222879684418, + "train_speed(iter/s)": 0.029913 + }, + { + "epoch": 0.07679692812287509, + "grad_norm": 0.053886114557468945, + "learning_rate": 1.9894486058618863e-05, + "loss": 0.46213107109069823, + "memory(GiB)": 73.06, + "step": 240, + "token_acc": 0.8886558627264061, + "train_speed(iter/s)": 0.02992 + }, + { + "epoch": 0.07839686412543498, + "grad_norm": 0.06048992108753748, + "learning_rate": 1.9886829265586368e-05, + "loss": 0.4749046802520752, + "memory(GiB)": 73.06, + "step": 245, + "token_acc": 0.8758281279575999, + "train_speed(iter/s)": 0.0298 + }, + { + "epoch": 0.07999680012799489, + "grad_norm": 0.06662896613700448, + "learning_rate": 1.9878905881817254e-05, + "loss": 0.47487664222717285, + "memory(GiB)": 73.06, + "step": 250, + "token_acc": 0.8239743295897318, + "train_speed(iter/s)": 0.029855 + }, + { + "epoch": 0.07999680012799489, + "eval_loss": 0.6802101731300354, + "eval_runtime": 108.8605, + "eval_samples_per_second": 184.53, + "eval_steps_per_second": 0.928, + "eval_token_acc": 0.8656411339267154, + "step": 250 + }, + { + "epoch": 0.08159673613055478, + "grad_norm": 0.058967589577093804, + "learning_rate": 1.9870716120959462e-05, + "loss": 0.4691306591033936, + "memory(GiB)": 73.24, + "step": 255, + "token_acc": 0.8612697569398327, + "train_speed(iter/s)": 0.029606 + }, + { + "epoch": 0.08319667213311467, + "grad_norm": 0.057353651690814994, + "learning_rate": 1.986226020384359e-05, + "loss": 0.46143622398376466, + "memory(GiB)": 73.24, + "step": 260, + "token_acc": 0.8685547371094742, + "train_speed(iter/s)": 0.029681 + }, + { + "epoch": 0.08479660813567458, + "grad_norm": 0.05409688809510523, + "learning_rate": 1.9853538358476933e-05, + "loss": 0.4704445838928223, + "memory(GiB)": 73.24, + "step": 265, + "token_acc": 0.8804637020144431, + "train_speed(iter/s)": 0.02979 + }, + { + "epoch": 0.08639654413823447, + "grad_norm": 0.06968473514476099, + "learning_rate": 1.9844550820037326e-05, + "loss": 0.4717890739440918, + "memory(GiB)": 73.24, + "step": 270, + "token_acc": 0.8638003174145145, + "train_speed(iter/s)": 0.029834 + }, + { + "epoch": 0.08799648014079436, + "grad_norm": 0.06009720175343309, + "learning_rate": 1.9835297830866827e-05, + "loss": 0.4709662437438965, + "memory(GiB)": 73.24, + "step": 275, + "token_acc": 0.8634590377113134, + "train_speed(iter/s)": 0.029835 + }, + { + "epoch": 0.08959641614335427, + "grad_norm": 0.058778539356308675, + "learning_rate": 1.9825779640465157e-05, + "loss": 0.47084336280822753, + "memory(GiB)": 73.24, + "step": 280, + "token_acc": 0.9203691779351793, + "train_speed(iter/s)": 0.029942 + }, + { + "epoch": 0.09119635214591416, + "grad_norm": 0.054325246749067864, + "learning_rate": 1.9815996505483e-05, + "loss": 0.4666774749755859, + "memory(GiB)": 73.24, + "step": 285, + "token_acc": 0.8521723454119344, + "train_speed(iter/s)": 0.029906 + }, + { + "epoch": 0.09279628814847406, + "grad_norm": 0.058698263071843435, + "learning_rate": 1.9805948689715043e-05, + "loss": 0.45826416015625, + "memory(GiB)": 73.24, + "step": 290, + "token_acc": 0.8421138211382114, + "train_speed(iter/s)": 0.029895 + }, + { + "epoch": 0.09439622415103396, + "grad_norm": 0.05517972536097747, + "learning_rate": 1.979563646409291e-05, + "loss": 0.47627692222595214, + "memory(GiB)": 73.24, + "step": 295, + "token_acc": 0.8784122999686226, + "train_speed(iter/s)": 0.029966 + }, + { + "epoch": 0.09599616015359386, + "grad_norm": 0.06223926082468345, + "learning_rate": 1.9785060106677818e-05, + "loss": 0.4711057186126709, + "memory(GiB)": 73.24, + "step": 300, + "token_acc": 0.876372039283651, + "train_speed(iter/s)": 0.029911 + }, + { + "epoch": 0.09759609615615375, + "grad_norm": 0.060678733702642235, + "learning_rate": 1.97742199026531e-05, + "loss": 0.46833024024963377, + "memory(GiB)": 73.24, + "step": 305, + "token_acc": 0.8586995355484102, + "train_speed(iter/s)": 0.029998 + }, + { + "epoch": 0.09919603215871366, + "grad_norm": 0.06117494885421727, + "learning_rate": 1.9763116144316506e-05, + "loss": 0.4692807197570801, + "memory(GiB)": 73.24, + "step": 310, + "token_acc": 0.8383072793304911, + "train_speed(iter/s)": 0.029983 + }, + { + "epoch": 0.10079596816127355, + "grad_norm": 0.059512004342169564, + "learning_rate": 1.9751749131072335e-05, + "loss": 0.462421178817749, + "memory(GiB)": 73.24, + "step": 315, + "token_acc": 0.865073787772312, + "train_speed(iter/s)": 0.029959 + }, + { + "epoch": 0.10239590416383344, + "grad_norm": 0.05759903892800583, + "learning_rate": 1.9740119169423337e-05, + "loss": 0.4749638080596924, + "memory(GiB)": 73.24, + "step": 320, + "token_acc": 0.8657438292194797, + "train_speed(iter/s)": 0.030064 + }, + { + "epoch": 0.10399584016639335, + "grad_norm": 0.05512670495542287, + "learning_rate": 1.9728226572962474e-05, + "loss": 0.48053979873657227, + "memory(GiB)": 73.24, + "step": 325, + "token_acc": 0.9068181818181819, + "train_speed(iter/s)": 0.03004 + }, + { + "epoch": 0.10559577616895324, + "grad_norm": 0.05723038100011267, + "learning_rate": 1.9716071662364454e-05, + "loss": 0.47551665306091306, + "memory(GiB)": 73.24, + "step": 330, + "token_acc": 0.8362432269717038, + "train_speed(iter/s)": 0.030003 + }, + { + "epoch": 0.10719571217151314, + "grad_norm": 0.057638605082885846, + "learning_rate": 1.970365476537707e-05, + "loss": 0.4652701854705811, + "memory(GiB)": 73.24, + "step": 335, + "token_acc": 0.8735049401976079, + "train_speed(iter/s)": 0.030082 + }, + { + "epoch": 0.10879564817407304, + "grad_norm": 0.05903871731521889, + "learning_rate": 1.9690976216812397e-05, + "loss": 0.4698742389678955, + "memory(GiB)": 73.24, + "step": 340, + "token_acc": 0.8620361560418649, + "train_speed(iter/s)": 0.030027 + }, + { + "epoch": 0.11039558417663294, + "grad_norm": 0.053856521964694516, + "learning_rate": 1.9678036358537726e-05, + "loss": 0.4701416015625, + "memory(GiB)": 73.24, + "step": 345, + "token_acc": 0.8708435421771089, + "train_speed(iter/s)": 0.03002 + }, + { + "epoch": 0.11199552017919283, + "grad_norm": 0.05586893539038131, + "learning_rate": 1.966483553946637e-05, + "loss": 0.47447028160095217, + "memory(GiB)": 73.24, + "step": 350, + "token_acc": 0.8617533718689788, + "train_speed(iter/s)": 0.030041 + }, + { + "epoch": 0.11359545618175274, + "grad_norm": 0.052599438001953325, + "learning_rate": 1.9651374115548255e-05, + "loss": 0.4637298583984375, + "memory(GiB)": 73.24, + "step": 355, + "token_acc": 0.8874341610233258, + "train_speed(iter/s)": 0.029967 + }, + { + "epoch": 0.11519539218431263, + "grad_norm": 0.05804143123407663, + "learning_rate": 1.9637652449760297e-05, + "loss": 0.4660144329071045, + "memory(GiB)": 73.24, + "step": 360, + "token_acc": 0.8349885408708938, + "train_speed(iter/s)": 0.030034 + }, + { + "epoch": 0.11679532818687252, + "grad_norm": 0.06055547849970778, + "learning_rate": 1.9623670912096656e-05, + "loss": 0.4716383934020996, + "memory(GiB)": 73.24, + "step": 365, + "token_acc": 0.8751012473675684, + "train_speed(iter/s)": 0.02998 + }, + { + "epoch": 0.11839526418943243, + "grad_norm": 0.058520598293842735, + "learning_rate": 1.9609429879558726e-05, + "loss": 0.46298699378967284, + "memory(GiB)": 73.24, + "step": 370, + "token_acc": 0.8553921568627451, + "train_speed(iter/s)": 0.029931 + }, + { + "epoch": 0.11999520019199232, + "grad_norm": 0.058584318589478955, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.4756875514984131, + "memory(GiB)": 73.24, + "step": 375, + "token_acc": 0.8618346545866364, + "train_speed(iter/s)": 0.030006 + }, + { + "epoch": 0.12159513619455221, + "grad_norm": 0.05966533070217228, + "learning_rate": 1.958017087284061e-05, + "loss": 0.4596414089202881, + "memory(GiB)": 73.24, + "step": 380, + "token_acc": 0.8836156297165856, + "train_speed(iter/s)": 0.029968 + }, + { + "epoch": 0.12319507219711212, + "grad_norm": 0.06510894340277039, + "learning_rate": 1.9565153687607006e-05, + "loss": 0.4687026023864746, + "memory(GiB)": 73.24, + "step": 385, + "token_acc": 0.8694005270092227, + "train_speed(iter/s)": 0.029956 + }, + { + "epoch": 0.12479500819967201, + "grad_norm": 0.05180588304383506, + "learning_rate": 1.9549878585371006e-05, + "loss": 0.4649878978729248, + "memory(GiB)": 73.24, + "step": 390, + "token_acc": 0.8677233429394813, + "train_speed(iter/s)": 0.029999 + }, + { + "epoch": 0.1263949442022319, + "grad_norm": 0.060875424512666344, + "learning_rate": 1.9534345978013972e-05, + "loss": 0.47073874473571775, + "memory(GiB)": 73.24, + "step": 395, + "token_acc": 0.8484663512894858, + "train_speed(iter/s)": 0.029947 + }, + { + "epoch": 0.1279948802047918, + "grad_norm": 0.0571374353277554, + "learning_rate": 1.9518556284360696e-05, + "loss": 0.4666412353515625, + "memory(GiB)": 73.24, + "step": 400, + "token_acc": 0.8975701436434421, + "train_speed(iter/s)": 0.029998 + }, + { + "epoch": 0.1295948162073517, + "grad_norm": 0.05215050598306155, + "learning_rate": 1.9502509930168113e-05, + "loss": 0.4628121376037598, + "memory(GiB)": 73.24, + "step": 405, + "token_acc": 0.8816677696889477, + "train_speed(iter/s)": 0.029966 + }, + { + "epoch": 0.1311947522099116, + "grad_norm": 0.07947570193916972, + "learning_rate": 1.9486207348113803e-05, + "loss": 0.4593012809753418, + "memory(GiB)": 73.24, + "step": 410, + "token_acc": 0.8692473832862602, + "train_speed(iter/s)": 0.029911 + }, + { + "epoch": 0.1327946882124715, + "grad_norm": 0.07262611466641217, + "learning_rate": 1.946964897778433e-05, + "loss": 0.47004990577697753, + "memory(GiB)": 73.24, + "step": 415, + "token_acc": 0.8736337958983176, + "train_speed(iter/s)": 0.029969 + }, + { + "epoch": 0.1343946242150314, + "grad_norm": 0.053754461298334506, + "learning_rate": 1.9452835265663404e-05, + "loss": 0.4695271015167236, + "memory(GiB)": 73.24, + "step": 420, + "token_acc": 0.8747993579454254, + "train_speed(iter/s)": 0.029901 + }, + { + "epoch": 0.1359945602175913, + "grad_norm": 0.0742051800083311, + "learning_rate": 1.9435766665119823e-05, + "loss": 0.47011446952819824, + "memory(GiB)": 73.24, + "step": 425, + "token_acc": 0.8356736242884251, + "train_speed(iter/s)": 0.029856 + }, + { + "epoch": 0.13759449622015119, + "grad_norm": 0.06429200177825628, + "learning_rate": 1.941844363639525e-05, + "loss": 0.476796817779541, + "memory(GiB)": 73.24, + "step": 430, + "token_acc": 0.869019972131909, + "train_speed(iter/s)": 0.0299 + }, + { + "epoch": 0.13919443222271108, + "grad_norm": 0.06544854557851852, + "learning_rate": 1.9400866646591816e-05, + "loss": 0.4666853904724121, + "memory(GiB)": 73.24, + "step": 435, + "token_acc": 0.8204667863554758, + "train_speed(iter/s)": 0.029847 + }, + { + "epoch": 0.140794368225271, + "grad_norm": 0.0546565929911768, + "learning_rate": 1.9383036169659513e-05, + "loss": 0.4738778591156006, + "memory(GiB)": 73.24, + "step": 440, + "token_acc": 0.8605809128630706, + "train_speed(iter/s)": 0.029855 + }, + { + "epoch": 0.1423943042278309, + "grad_norm": 0.06789336848906298, + "learning_rate": 1.936495268638342e-05, + "loss": 0.47726120948791506, + "memory(GiB)": 73.24, + "step": 445, + "token_acc": 0.8404369243949454, + "train_speed(iter/s)": 0.029875 + }, + { + "epoch": 0.14399424023039079, + "grad_norm": 0.049909982274150465, + "learning_rate": 1.934661668437073e-05, + "loss": 0.47165632247924805, + "memory(GiB)": 73.24, + "step": 450, + "token_acc": 0.848471615720524, + "train_speed(iter/s)": 0.029826 + }, + { + "epoch": 0.14559417623295068, + "grad_norm": 0.057441474731933166, + "learning_rate": 1.932802865803763e-05, + "loss": 0.4703391075134277, + "memory(GiB)": 73.24, + "step": 455, + "token_acc": 0.8466442358774571, + "train_speed(iter/s)": 0.029877 + }, + { + "epoch": 0.14719411223551057, + "grad_norm": 0.07263904251491092, + "learning_rate": 1.930918910859592e-05, + "loss": 0.467697811126709, + "memory(GiB)": 73.24, + "step": 460, + "token_acc": 0.8491142333536957, + "train_speed(iter/s)": 0.02984 + }, + { + "epoch": 0.14879404823807046, + "grad_norm": 0.06769237623086669, + "learning_rate": 1.9290098544039546e-05, + "loss": 0.46541628837585447, + "memory(GiB)": 73.24, + "step": 465, + "token_acc": 0.8555353301340394, + "train_speed(iter/s)": 0.02978 + }, + { + "epoch": 0.15039398424063039, + "grad_norm": 0.06751583633556477, + "learning_rate": 1.927075747913088e-05, + "loss": 0.47134056091308596, + "memory(GiB)": 73.24, + "step": 470, + "token_acc": 0.8708000507163687, + "train_speed(iter/s)": 0.029832 + }, + { + "epoch": 0.15199392024319028, + "grad_norm": 0.0539492567012165, + "learning_rate": 1.9251166435386837e-05, + "loss": 0.4645866394042969, + "memory(GiB)": 73.24, + "step": 475, + "token_acc": 0.849832526981764, + "train_speed(iter/s)": 0.029779 + }, + { + "epoch": 0.15359385624575017, + "grad_norm": 0.06038706866556876, + "learning_rate": 1.923132594106483e-05, + "loss": 0.46890692710876464, + "memory(GiB)": 73.24, + "step": 480, + "token_acc": 0.8665925514174542, + "train_speed(iter/s)": 0.02976 + }, + { + "epoch": 0.15519379224831006, + "grad_norm": 0.05215840717634863, + "learning_rate": 1.92112365311485e-05, + "loss": 0.46829919815063475, + "memory(GiB)": 73.24, + "step": 485, + "token_acc": 0.861963565228023, + "train_speed(iter/s)": 0.029794 + }, + { + "epoch": 0.15679372825086996, + "grad_norm": 0.06554142579397569, + "learning_rate": 1.919089874733332e-05, + "loss": 0.4702622413635254, + "memory(GiB)": 73.24, + "step": 490, + "token_acc": 0.8809186723297153, + "train_speed(iter/s)": 0.029747 + }, + { + "epoch": 0.15839366425342985, + "grad_norm": 0.0601172563145885, + "learning_rate": 1.9170313138011964e-05, + "loss": 0.46490135192871096, + "memory(GiB)": 73.24, + "step": 495, + "token_acc": 0.8890911637025627, + "train_speed(iter/s)": 0.02977 + }, + { + "epoch": 0.15999360025598977, + "grad_norm": 0.05924399402367875, + "learning_rate": 1.9149480258259535e-05, + "loss": 0.46698894500732424, + "memory(GiB)": 73.24, + "step": 500, + "token_acc": 0.8781434114096853, + "train_speed(iter/s)": 0.029766 + }, + { + "epoch": 0.15999360025598977, + "eval_loss": 0.677643895149231, + "eval_runtime": 109.3458, + "eval_samples_per_second": 183.711, + "eval_steps_per_second": 0.924, + "eval_token_acc": 0.8661408286670019, + "step": 500 + }, + { + "epoch": 0.16159353625854966, + "grad_norm": 0.05118070522939682, + "learning_rate": 1.9128400669818586e-05, + "loss": 0.4606743812561035, + "memory(GiB)": 73.24, + "step": 505, + "token_acc": 0.8727327237295758, + "train_speed(iter/s)": 0.029628 + }, + { + "epoch": 0.16319347226110956, + "grad_norm": 0.05904937387674259, + "learning_rate": 1.9107074941083987e-05, + "loss": 0.47115492820739746, + "memory(GiB)": 73.24, + "step": 510, + "token_acc": 0.8801781737193763, + "train_speed(iter/s)": 0.029663 + }, + { + "epoch": 0.16479340826366945, + "grad_norm": 0.061211680590962145, + "learning_rate": 1.9085503647087588e-05, + "loss": 0.46154184341430665, + "memory(GiB)": 73.24, + "step": 515, + "token_acc": 0.8573438874230431, + "train_speed(iter/s)": 0.029714 + }, + { + "epoch": 0.16639334426622934, + "grad_norm": 0.05461804298242196, + "learning_rate": 1.906368736948272e-05, + "loss": 0.46891465187072756, + "memory(GiB)": 73.24, + "step": 520, + "token_acc": 0.8665508756694167, + "train_speed(iter/s)": 0.029721 + }, + { + "epoch": 0.16799328026878924, + "grad_norm": 0.059072521440841075, + "learning_rate": 1.9041626696528503e-05, + "loss": 0.4666083812713623, + "memory(GiB)": 73.24, + "step": 525, + "token_acc": 0.8742783835792175, + "train_speed(iter/s)": 0.029735 + }, + { + "epoch": 0.16959321627134916, + "grad_norm": 0.06762878495647719, + "learning_rate": 1.9019322223073997e-05, + "loss": 0.4684437274932861, + "memory(GiB)": 73.24, + "step": 530, + "token_acc": 0.8906074591493077, + "train_speed(iter/s)": 0.029782 + }, + { + "epoch": 0.17119315227390905, + "grad_norm": 0.05741557316745661, + "learning_rate": 1.899677455054215e-05, + "loss": 0.4690097332000732, + "memory(GiB)": 73.24, + "step": 535, + "token_acc": 0.8231878958479943, + "train_speed(iter/s)": 0.029785 + }, + { + "epoch": 0.17279308827646894, + "grad_norm": 0.049026865135578496, + "learning_rate": 1.8973984286913584e-05, + "loss": 0.469140625, + "memory(GiB)": 73.24, + "step": 540, + "token_acc": 0.8849415539766216, + "train_speed(iter/s)": 0.029789 + }, + { + "epoch": 0.17439302427902884, + "grad_norm": 0.059746465018255104, + "learning_rate": 1.895095204671021e-05, + "loss": 0.4646149158477783, + "memory(GiB)": 73.24, + "step": 545, + "token_acc": 0.8944385405596883, + "train_speed(iter/s)": 0.029813 + }, + { + "epoch": 0.17599296028158873, + "grad_norm": 0.049833714934798115, + "learning_rate": 1.892767845097864e-05, + "loss": 0.47077240943908694, + "memory(GiB)": 73.24, + "step": 550, + "token_acc": 0.8640860961638605, + "train_speed(iter/s)": 0.029794 + }, + { + "epoch": 0.17759289628414862, + "grad_norm": 0.06593845007149325, + "learning_rate": 1.890416412727346e-05, + "loss": 0.46265759468078616, + "memory(GiB)": 73.24, + "step": 555, + "token_acc": 0.8249895412076419, + "train_speed(iter/s)": 0.02984 + }, + { + "epoch": 0.17919283228670854, + "grad_norm": 0.058254003445636866, + "learning_rate": 1.88804097096403e-05, + "loss": 0.459829044342041, + "memory(GiB)": 73.24, + "step": 560, + "token_acc": 0.8835873095178616, + "train_speed(iter/s)": 0.029842 + }, + { + "epoch": 0.18079276828926844, + "grad_norm": 0.07335953644753283, + "learning_rate": 1.8856415838598738e-05, + "loss": 0.45765042304992676, + "memory(GiB)": 73.24, + "step": 565, + "token_acc": 0.8755007210382951, + "train_speed(iter/s)": 0.029818 + }, + { + "epoch": 0.18239270429182833, + "grad_norm": 0.06659181547700674, + "learning_rate": 1.8832183161125026e-05, + "loss": 0.4609128475189209, + "memory(GiB)": 73.24, + "step": 570, + "token_acc": 0.8344311377245509, + "train_speed(iter/s)": 0.029871 + }, + { + "epoch": 0.18399264029438822, + "grad_norm": 0.05836437871791382, + "learning_rate": 1.8807712330634645e-05, + "loss": 0.4691438674926758, + "memory(GiB)": 73.24, + "step": 575, + "token_acc": 0.8848027659908848, + "train_speed(iter/s)": 0.029828 + }, + { + "epoch": 0.18559257629694811, + "grad_norm": 0.05735059462858394, + "learning_rate": 1.87830040069647e-05, + "loss": 0.4602513790130615, + "memory(GiB)": 73.24, + "step": 580, + "token_acc": 0.8959147903465012, + "train_speed(iter/s)": 0.029816 + }, + { + "epoch": 0.187192512299508, + "grad_norm": 0.05337219773586585, + "learning_rate": 1.87580588563561e-05, + "loss": 0.46318631172180175, + "memory(GiB)": 73.24, + "step": 585, + "token_acc": 0.8725881039706586, + "train_speed(iter/s)": 0.029851 + }, + { + "epoch": 0.18879244830206793, + "grad_norm": 0.05886716832883729, + "learning_rate": 1.873287755143563e-05, + "loss": 0.4604507923126221, + "memory(GiB)": 73.24, + "step": 590, + "token_acc": 0.9041755130927105, + "train_speed(iter/s)": 0.029822 + }, + { + "epoch": 0.19039238430462782, + "grad_norm": 0.053483810048332456, + "learning_rate": 1.8707460771197773e-05, + "loss": 0.46618080139160156, + "memory(GiB)": 73.24, + "step": 595, + "token_acc": 0.8785046728971962, + "train_speed(iter/s)": 0.029819 + }, + { + "epoch": 0.1919923203071877, + "grad_norm": 0.0518592001281956, + "learning_rate": 1.868180920098644e-05, + "loss": 0.4680916786193848, + "memory(GiB)": 73.24, + "step": 600, + "token_acc": 0.8467063770147162, + "train_speed(iter/s)": 0.029843 + }, + { + "epoch": 0.1935922563097476, + "grad_norm": 0.07018232236413237, + "learning_rate": 1.8655923532476463e-05, + "loss": 0.46170759201049805, + "memory(GiB)": 73.24, + "step": 605, + "token_acc": 0.889030612244898, + "train_speed(iter/s)": 0.02981 + }, + { + "epoch": 0.1951921923123075, + "grad_norm": 0.06030421269833889, + "learning_rate": 1.8629804463654956e-05, + "loss": 0.46511187553405764, + "memory(GiB)": 73.24, + "step": 610, + "token_acc": 0.8554680664916885, + "train_speed(iter/s)": 0.029852 + }, + { + "epoch": 0.1967921283148674, + "grad_norm": 0.056137765321266526, + "learning_rate": 1.8603452698802498e-05, + "loss": 0.47327299118041993, + "memory(GiB)": 76.61, + "step": 615, + "token_acc": 0.8645191852202747, + "train_speed(iter/s)": 0.029831 + }, + { + "epoch": 0.1983920643174273, + "grad_norm": 0.05458475201274465, + "learning_rate": 1.857686894847413e-05, + "loss": 0.45963249206542967, + "memory(GiB)": 76.61, + "step": 620, + "token_acc": 0.8517509197438343, + "train_speed(iter/s)": 0.029791 + }, + { + "epoch": 0.1999920003199872, + "grad_norm": 0.059902578480064236, + "learning_rate": 1.8550053929480202e-05, + "loss": 0.4687147617340088, + "memory(GiB)": 76.61, + "step": 625, + "token_acc": 0.8958185683912119, + "train_speed(iter/s)": 0.029833 + }, + { + "epoch": 0.2015919363225471, + "grad_norm": 0.0539478773118384, + "learning_rate": 1.8523008364867056e-05, + "loss": 0.4696544647216797, + "memory(GiB)": 76.61, + "step": 630, + "token_acc": 0.8439355385920272, + "train_speed(iter/s)": 0.029796 + }, + { + "epoch": 0.203191872325107, + "grad_norm": 0.05688926646164217, + "learning_rate": 1.8495732983897504e-05, + "loss": 0.4628334045410156, + "memory(GiB)": 76.61, + "step": 635, + "token_acc": 0.8406656465187452, + "train_speed(iter/s)": 0.029775 + }, + { + "epoch": 0.20479180832766689, + "grad_norm": 0.055104479428209605, + "learning_rate": 1.8468228522031197e-05, + "loss": 0.4559271812438965, + "memory(GiB)": 76.61, + "step": 640, + "token_acc": 0.8823529411764706, + "train_speed(iter/s)": 0.029794 + }, + { + "epoch": 0.20639174433022678, + "grad_norm": 0.058080447436547736, + "learning_rate": 1.8440495720904758e-05, + "loss": 0.4649765968322754, + "memory(GiB)": 76.61, + "step": 645, + "token_acc": 0.8708735027753433, + "train_speed(iter/s)": 0.029752 + }, + { + "epoch": 0.2079916803327867, + "grad_norm": 0.06300003986546152, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.47095327377319335, + "memory(GiB)": 76.61, + "step": 650, + "token_acc": 0.8504976200778883, + "train_speed(iter/s)": 0.029755 + }, + { + "epoch": 0.2095916163353466, + "grad_norm": 0.06584526718748161, + "learning_rate": 1.8384348098182815e-05, + "loss": 0.46697392463684084, + "memory(GiB)": 76.61, + "step": 655, + "token_acc": 0.8224407171775593, + "train_speed(iter/s)": 0.029762 + }, + { + "epoch": 0.21119155233790649, + "grad_norm": 0.07147957728971413, + "learning_rate": 1.8355934790564718e-05, + "loss": 0.4684570789337158, + "memory(GiB)": 76.61, + "step": 660, + "token_acc": 0.8842165898617511, + "train_speed(iter/s)": 0.029723 + }, + { + "epoch": 0.21279148834046638, + "grad_norm": 0.06592046292925295, + "learning_rate": 1.832729617160047e-05, + "loss": 0.461454439163208, + "memory(GiB)": 76.61, + "step": 665, + "token_acc": 0.9114801444043321, + "train_speed(iter/s)": 0.02976 + }, + { + "epoch": 0.21439142434302627, + "grad_norm": 0.0656829490109071, + "learning_rate": 1.8298433013508384e-05, + "loss": 0.46404447555541994, + "memory(GiB)": 76.61, + "step": 670, + "token_acc": 0.8516549891278087, + "train_speed(iter/s)": 0.029736 + }, + { + "epoch": 0.21599136034558616, + "grad_norm": 0.05417998837874903, + "learning_rate": 1.826934609456129e-05, + "loss": 0.47208566665649415, + "memory(GiB)": 76.61, + "step": 675, + "token_acc": 0.8798815733822078, + "train_speed(iter/s)": 0.029718 + }, + { + "epoch": 0.21759129634814608, + "grad_norm": 0.06917195844649823, + "learning_rate": 1.8240036199065546e-05, + "loss": 0.4724391460418701, + "memory(GiB)": 76.61, + "step": 680, + "token_acc": 0.875845675626257, + "train_speed(iter/s)": 0.029745 + }, + { + "epoch": 0.21919123235070598, + "grad_norm": 0.055849189404917746, + "learning_rate": 1.8210504117339917e-05, + "loss": 0.463816499710083, + "memory(GiB)": 76.61, + "step": 685, + "token_acc": 0.8841904379268782, + "train_speed(iter/s)": 0.029711 + }, + { + "epoch": 0.22079116835326587, + "grad_norm": 0.059563786969142496, + "learning_rate": 1.8180750645694236e-05, + "loss": 0.4678086757659912, + "memory(GiB)": 76.61, + "step": 690, + "token_acc": 0.8675231977159172, + "train_speed(iter/s)": 0.029714 + }, + { + "epoch": 0.22239110435582576, + "grad_norm": 0.05908606421708839, + "learning_rate": 1.8150776586407957e-05, + "loss": 0.46315860748291016, + "memory(GiB)": 76.61, + "step": 695, + "token_acc": 0.8914956011730205, + "train_speed(iter/s)": 0.029731 + }, + { + "epoch": 0.22399104035838566, + "grad_norm": 0.05617530731492468, + "learning_rate": 1.8120582747708503e-05, + "loss": 0.46682062149047854, + "memory(GiB)": 76.61, + "step": 700, + "token_acc": 0.8805088596092685, + "train_speed(iter/s)": 0.029689 + }, + { + "epoch": 0.22559097636094555, + "grad_norm": 0.06138477303861948, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.47155141830444336, + "memory(GiB)": 76.61, + "step": 705, + "token_acc": 0.8753766681015928, + "train_speed(iter/s)": 0.029703 + }, + { + "epoch": 0.22719091236350547, + "grad_norm": 0.07073141016351848, + "learning_rate": 1.8059538994588715e-05, + "loss": 0.45953845977783203, + "memory(GiB)": 76.61, + "step": 710, + "token_acc": 0.8449233877757198, + "train_speed(iter/s)": 0.0297 + }, + { + "epoch": 0.22879084836606536, + "grad_norm": 0.06266619359839708, + "learning_rate": 1.8028690726166172e-05, + "loss": 0.4604049205780029, + "memory(GiB)": 76.61, + "step": 715, + "token_acc": 0.8688032048072108, + "train_speed(iter/s)": 0.02966 + }, + { + "epoch": 0.23039078436862526, + "grad_norm": 0.0563660774004587, + "learning_rate": 1.7997625970281652e-05, + "loss": 0.4622708797454834, + "memory(GiB)": 76.61, + "step": 720, + "token_acc": 0.8698216735253772, + "train_speed(iter/s)": 0.029685 + }, + { + "epoch": 0.23199072037118515, + "grad_norm": 0.06596213612143108, + "learning_rate": 1.796634556457236e-05, + "loss": 0.4681892395019531, + "memory(GiB)": 76.61, + "step": 725, + "token_acc": 0.8842619184376795, + "train_speed(iter/s)": 0.029661 + }, + { + "epoch": 0.23359065637374504, + "grad_norm": 0.05364579438678848, + "learning_rate": 1.793485035249036e-05, + "loss": 0.46258745193481443, + "memory(GiB)": 76.61, + "step": 730, + "token_acc": 0.8599531615925059, + "train_speed(iter/s)": 0.02965 + }, + { + "epoch": 0.23519059237630494, + "grad_norm": 0.07509450433159735, + "learning_rate": 1.7903141183279776e-05, + "loss": 0.47242441177368166, + "memory(GiB)": 76.61, + "step": 735, + "token_acc": 0.8404958677685951, + "train_speed(iter/s)": 0.029665 + }, + { + "epoch": 0.23679052837886486, + "grad_norm": 0.06478313540282635, + "learning_rate": 1.7871218911953942e-05, + "loss": 0.4565444469451904, + "memory(GiB)": 76.61, + "step": 740, + "token_acc": 0.8338650865998177, + "train_speed(iter/s)": 0.029634 + }, + { + "epoch": 0.23839046438142475, + "grad_norm": 0.06348939893307848, + "learning_rate": 1.7839084399272317e-05, + "loss": 0.4670473575592041, + "memory(GiB)": 76.61, + "step": 745, + "token_acc": 0.8652410477034038, + "train_speed(iter/s)": 0.029638 + }, + { + "epoch": 0.23999040038398464, + "grad_norm": 0.07434587030241245, + "learning_rate": 1.780673851171728e-05, + "loss": 0.47047910690307615, + "memory(GiB)": 76.61, + "step": 750, + "token_acc": 0.88801504530689, + "train_speed(iter/s)": 0.029638 + }, + { + "epoch": 0.23999040038398464, + "eval_loss": 0.6746003031730652, + "eval_runtime": 113.2223, + "eval_samples_per_second": 177.421, + "eval_steps_per_second": 0.892, + "eval_token_acc": 0.8668385651547512, + "step": 750 + }, + { + "epoch": 0.24159033638654454, + "grad_norm": 0.06732795706859432, + "learning_rate": 1.777418212147079e-05, + "loss": 0.46190509796142576, + "memory(GiB)": 76.61, + "step": 755, + "token_acc": 0.8881346728210697, + "train_speed(iter/s)": 0.029543 + }, + { + "epoch": 0.24319027238910443, + "grad_norm": 0.06836940989947664, + "learning_rate": 1.7741416106390828e-05, + "loss": 0.46283302307128904, + "memory(GiB)": 76.61, + "step": 760, + "token_acc": 0.8831443688586545, + "train_speed(iter/s)": 0.029566 + }, + { + "epoch": 0.24479020839166432, + "grad_norm": 0.07072489516219096, + "learning_rate": 1.7708441349987753e-05, + "loss": 0.4619740962982178, + "memory(GiB)": 76.61, + "step": 765, + "token_acc": 0.8610668789808917, + "train_speed(iter/s)": 0.0296 + }, + { + "epoch": 0.24639014439422424, + "grad_norm": 0.07152232857362027, + "learning_rate": 1.767525874140048e-05, + "loss": 0.46694121360778806, + "memory(GiB)": 76.61, + "step": 770, + "token_acc": 0.8397869022869023, + "train_speed(iter/s)": 0.029606 + }, + { + "epoch": 0.24799008039678413, + "grad_norm": 0.059354056163304685, + "learning_rate": 1.7641869175372493e-05, + "loss": 0.4596868991851807, + "memory(GiB)": 76.61, + "step": 775, + "token_acc": 0.8582827406764961, + "train_speed(iter/s)": 0.029599 + }, + { + "epoch": 0.24959001639934403, + "grad_norm": 0.0629690289705531, + "learning_rate": 1.7608273552227723e-05, + "loss": 0.4583168029785156, + "memory(GiB)": 76.61, + "step": 780, + "token_acc": 0.8841532106646639, + "train_speed(iter/s)": 0.029639 + }, + { + "epoch": 0.25118995240190395, + "grad_norm": 0.05810355160479093, + "learning_rate": 1.7574472777846276e-05, + "loss": 0.47337069511413576, + "memory(GiB)": 76.61, + "step": 785, + "token_acc": 0.8676557863501484, + "train_speed(iter/s)": 0.029632 + }, + { + "epoch": 0.2527898884044638, + "grad_norm": 0.05365185572887828, + "learning_rate": 1.7540467763639994e-05, + "loss": 0.46567063331604003, + "memory(GiB)": 76.61, + "step": 790, + "token_acc": 0.8745288099084545, + "train_speed(iter/s)": 0.029629 + }, + { + "epoch": 0.25438982440702373, + "grad_norm": 0.054672322658953366, + "learning_rate": 1.7506259426527903e-05, + "loss": 0.47023472785949705, + "memory(GiB)": 76.61, + "step": 795, + "token_acc": 0.874407844001322, + "train_speed(iter/s)": 0.02965 + }, + { + "epoch": 0.2559897604095836, + "grad_norm": 0.057060955079149434, + "learning_rate": 1.7471848688911465e-05, + "loss": 0.4684537410736084, + "memory(GiB)": 76.61, + "step": 800, + "token_acc": 0.8839382448537378, + "train_speed(iter/s)": 0.029634 + }, + { + "epoch": 0.2575896964121435, + "grad_norm": 0.06051290772323595, + "learning_rate": 1.7437236478649718e-05, + "loss": 0.46199979782104494, + "memory(GiB)": 76.61, + "step": 805, + "token_acc": 0.8673650919153983, + "train_speed(iter/s)": 0.02966 + }, + { + "epoch": 0.2591896324147034, + "grad_norm": 0.0643397562387603, + "learning_rate": 1.7402423729034252e-05, + "loss": 0.4548381805419922, + "memory(GiB)": 76.61, + "step": 810, + "token_acc": 0.83125, + "train_speed(iter/s)": 0.029652 + }, + { + "epoch": 0.2607895684172633, + "grad_norm": 0.065624934571794, + "learning_rate": 1.736741137876405e-05, + "loss": 0.46353764533996583, + "memory(GiB)": 76.61, + "step": 815, + "token_acc": 0.8907202528787537, + "train_speed(iter/s)": 0.029628 + }, + { + "epoch": 0.2623895044198232, + "grad_norm": 0.053961693017135055, + "learning_rate": 1.7332200371920173e-05, + "loss": 0.46685361862182617, + "memory(GiB)": 76.61, + "step": 820, + "token_acc": 0.8522188711762172, + "train_speed(iter/s)": 0.029672 + }, + { + "epoch": 0.2639894404223831, + "grad_norm": 0.054388550053431586, + "learning_rate": 1.72967916579403e-05, + "loss": 0.46024084091186523, + "memory(GiB)": 76.61, + "step": 825, + "token_acc": 0.8684630384683567, + "train_speed(iter/s)": 0.02966 + }, + { + "epoch": 0.265589376424943, + "grad_norm": 0.0583019332597641, + "learning_rate": 1.7261186191593135e-05, + "loss": 0.47214059829711913, + "memory(GiB)": 76.61, + "step": 830, + "token_acc": 0.8717123935666982, + "train_speed(iter/s)": 0.029645 + }, + { + "epoch": 0.2671893124275029, + "grad_norm": 0.06004272220759217, + "learning_rate": 1.7225384932952655e-05, + "loss": 0.4626835823059082, + "memory(GiB)": 76.61, + "step": 835, + "token_acc": 0.8737211788059246, + "train_speed(iter/s)": 0.02967 + }, + { + "epoch": 0.2687892484300628, + "grad_norm": 0.05611993161069816, + "learning_rate": 1.7189388847372227e-05, + "loss": 0.46799750328063966, + "memory(GiB)": 76.61, + "step": 840, + "token_acc": 0.8781684382665577, + "train_speed(iter/s)": 0.029642 + }, + { + "epoch": 0.2703891844326227, + "grad_norm": 0.06345947319153013, + "learning_rate": 1.715319890545857e-05, + "loss": 0.4568619728088379, + "memory(GiB)": 76.61, + "step": 845, + "token_acc": 0.860916969527537, + "train_speed(iter/s)": 0.029655 + }, + { + "epoch": 0.2719891204351826, + "grad_norm": 0.0592531603954309, + "learning_rate": 1.7116816083045603e-05, + "loss": 0.46942729949951173, + "memory(GiB)": 76.61, + "step": 850, + "token_acc": 0.8726317245194303, + "train_speed(iter/s)": 0.029655 + }, + { + "epoch": 0.2735890564377425, + "grad_norm": 0.05711267065318382, + "learning_rate": 1.7080241361168108e-05, + "loss": 0.45801239013671874, + "memory(GiB)": 76.61, + "step": 855, + "token_acc": 0.8834167608590344, + "train_speed(iter/s)": 0.02963 + }, + { + "epoch": 0.27518899244030237, + "grad_norm": 0.05715792257951623, + "learning_rate": 1.704347572603529e-05, + "loss": 0.4675910472869873, + "memory(GiB)": 76.61, + "step": 860, + "token_acc": 0.8361073624231519, + "train_speed(iter/s)": 0.029659 + }, + { + "epoch": 0.2767889284428623, + "grad_norm": 0.056617536923221766, + "learning_rate": 1.700652016900419e-05, + "loss": 0.467483377456665, + "memory(GiB)": 76.61, + "step": 865, + "token_acc": 0.8753590807532716, + "train_speed(iter/s)": 0.029639 + }, + { + "epoch": 0.27838886444542216, + "grad_norm": 0.060433939578350394, + "learning_rate": 1.696937568655294e-05, + "loss": 0.46129570007324217, + "memory(GiB)": 76.61, + "step": 870, + "token_acc": 0.8700755748512623, + "train_speed(iter/s)": 0.029622 + }, + { + "epoch": 0.2799888004479821, + "grad_norm": 0.06826391103956585, + "learning_rate": 1.6932043280253892e-05, + "loss": 0.47449960708618166, + "memory(GiB)": 76.61, + "step": 875, + "token_acc": 0.8767408356010885, + "train_speed(iter/s)": 0.02965 + }, + { + "epoch": 0.281588736450542, + "grad_norm": 0.060978189753072065, + "learning_rate": 1.689452395674664e-05, + "loss": 0.464243745803833, + "memory(GiB)": 76.61, + "step": 880, + "token_acc": 0.8622170179547228, + "train_speed(iter/s)": 0.029624 + }, + { + "epoch": 0.28318867245310186, + "grad_norm": 0.0760276206328267, + "learning_rate": 1.6856818727710847e-05, + "loss": 0.4566212177276611, + "memory(GiB)": 76.61, + "step": 885, + "token_acc": 0.8465499485066942, + "train_speed(iter/s)": 0.029618 + }, + { + "epoch": 0.2847886084556618, + "grad_norm": 0.05693121191664627, + "learning_rate": 1.6818928609838967e-05, + "loss": 0.46042599678039553, + "memory(GiB)": 76.61, + "step": 890, + "token_acc": 0.8798391728891441, + "train_speed(iter/s)": 0.029627 + }, + { + "epoch": 0.28638854445822165, + "grad_norm": 0.05744826995499506, + "learning_rate": 1.678085462480885e-05, + "loss": 0.4604465961456299, + "memory(GiB)": 76.61, + "step": 895, + "token_acc": 0.8780676542118063, + "train_speed(iter/s)": 0.029599 + }, + { + "epoch": 0.28798848046078157, + "grad_norm": 0.06271464886952488, + "learning_rate": 1.6742597799256182e-05, + "loss": 0.46231966018676757, + "memory(GiB)": 76.61, + "step": 900, + "token_acc": 0.8866765515780555, + "train_speed(iter/s)": 0.029611 + }, + { + "epoch": 0.2895884164633415, + "grad_norm": 0.06044356676681803, + "learning_rate": 1.6704159164746797e-05, + "loss": 0.47655544281005857, + "memory(GiB)": 76.61, + "step": 905, + "token_acc": 0.8872944211544663, + "train_speed(iter/s)": 0.029601 + }, + { + "epoch": 0.29118835246590136, + "grad_norm": 0.05103569816400521, + "learning_rate": 1.6665539757748866e-05, + "loss": 0.4603917121887207, + "memory(GiB)": 76.61, + "step": 910, + "token_acc": 0.8611705475141599, + "train_speed(iter/s)": 0.029574 + }, + { + "epoch": 0.2927882884684613, + "grad_norm": 0.055811472748585486, + "learning_rate": 1.6626740619604967e-05, + "loss": 0.46213679313659667, + "memory(GiB)": 76.61, + "step": 915, + "token_acc": 0.8148507643775783, + "train_speed(iter/s)": 0.029594 + }, + { + "epoch": 0.29438822447102114, + "grad_norm": 0.05463929857953068, + "learning_rate": 1.658776279650397e-05, + "loss": 0.4658839702606201, + "memory(GiB)": 76.61, + "step": 920, + "token_acc": 0.8766637089618456, + "train_speed(iter/s)": 0.029577 + }, + { + "epoch": 0.29598816047358106, + "grad_norm": 0.06343067949686905, + "learning_rate": 1.6548607339452853e-05, + "loss": 0.46423888206481934, + "memory(GiB)": 76.61, + "step": 925, + "token_acc": 0.8785782119115453, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.29758809647614093, + "grad_norm": 0.052431934937864355, + "learning_rate": 1.6509275304248366e-05, + "loss": 0.46324734687805175, + "memory(GiB)": 76.61, + "step": 930, + "token_acc": 0.8571011956838729, + "train_speed(iter/s)": 0.02958 + }, + { + "epoch": 0.29918803247870085, + "grad_norm": 0.059009943510604755, + "learning_rate": 1.6469767751448538e-05, + "loss": 0.46290836334228513, + "memory(GiB)": 76.61, + "step": 935, + "token_acc": 0.8388616290480864, + "train_speed(iter/s)": 0.029556 + }, + { + "epoch": 0.30078796848126077, + "grad_norm": 0.05160057372757322, + "learning_rate": 1.6430085746344107e-05, + "loss": 0.45898871421813964, + "memory(GiB)": 76.61, + "step": 940, + "token_acc": 0.8690580344123651, + "train_speed(iter/s)": 0.029556 + }, + { + "epoch": 0.30238790448382064, + "grad_norm": 0.05612231994140208, + "learning_rate": 1.639023035892978e-05, + "loss": 0.4546724796295166, + "memory(GiB)": 76.61, + "step": 945, + "token_acc": 0.876509544215037, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.30398784048638056, + "grad_norm": 0.06733149115024578, + "learning_rate": 1.6350202663875385e-05, + "loss": 0.4598522663116455, + "memory(GiB)": 76.61, + "step": 950, + "token_acc": 0.8623452294246177, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.3055877764889404, + "grad_norm": 0.05450569676621943, + "learning_rate": 1.6310003740496887e-05, + "loss": 0.4602477550506592, + "memory(GiB)": 76.61, + "step": 955, + "token_acc": 0.8647700701480904, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.30718771249150034, + "grad_norm": 0.06736921151917717, + "learning_rate": 1.6269634672727296e-05, + "loss": 0.4589672565460205, + "memory(GiB)": 76.61, + "step": 960, + "token_acc": 0.877502001601281, + "train_speed(iter/s)": 0.029536 + }, + { + "epoch": 0.30878764849406026, + "grad_norm": 0.06166660436042404, + "learning_rate": 1.6229096549087434e-05, + "loss": 0.4601268291473389, + "memory(GiB)": 76.61, + "step": 965, + "token_acc": 0.8723534201954397, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.31038758449662013, + "grad_norm": 0.055128746386822226, + "learning_rate": 1.618839046265658e-05, + "loss": 0.4666788101196289, + "memory(GiB)": 76.61, + "step": 970, + "token_acc": 0.8550563360689943, + "train_speed(iter/s)": 0.029541 + }, + { + "epoch": 0.31198752049918005, + "grad_norm": 0.056867326711030626, + "learning_rate": 1.614751751104301e-05, + "loss": 0.4646125793457031, + "memory(GiB)": 76.61, + "step": 975, + "token_acc": 0.8651571964234208, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.3135874565017399, + "grad_norm": 0.05501107287069041, + "learning_rate": 1.6106478796354382e-05, + "loss": 0.4588280200958252, + "memory(GiB)": 76.61, + "step": 980, + "token_acc": 0.8767766331985918, + "train_speed(iter/s)": 0.029517 + }, + { + "epoch": 0.31518739250429983, + "grad_norm": 0.08099201898186387, + "learning_rate": 1.6065275425168034e-05, + "loss": 0.4589373111724854, + "memory(GiB)": 76.61, + "step": 985, + "token_acc": 0.8917890157694399, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.3167873285068597, + "grad_norm": 0.0522899382710734, + "learning_rate": 1.602390850850113e-05, + "loss": 0.46761279106140136, + "memory(GiB)": 76.61, + "step": 990, + "token_acc": 0.8461229409401366, + "train_speed(iter/s)": 0.029505 + }, + { + "epoch": 0.3183872645094196, + "grad_norm": 0.05838858698011934, + "learning_rate": 1.5982379161780722e-05, + "loss": 0.44941887855529783, + "memory(GiB)": 76.61, + "step": 995, + "token_acc": 0.8547228871294421, + "train_speed(iter/s)": 0.029511 + }, + { + "epoch": 0.31998720051197954, + "grad_norm": 0.054930484370324516, + "learning_rate": 1.5940688504813664e-05, + "loss": 0.4591392517089844, + "memory(GiB)": 76.61, + "step": 1000, + "token_acc": 0.8995555555555556, + "train_speed(iter/s)": 0.029505 + }, + { + "epoch": 0.31998720051197954, + "eval_loss": 0.671963095664978, + "eval_runtime": 110.8694, + "eval_samples_per_second": 181.186, + "eval_steps_per_second": 0.911, + "eval_token_acc": 0.8676077802864524, + "step": 1000 + }, + { + "epoch": 0.3215871365145394, + "grad_norm": 0.0578985798516978, + "learning_rate": 1.5898837661756405e-05, + "loss": 0.46222972869873047, + "memory(GiB)": 76.61, + "step": 1005, + "token_acc": 0.8840002569208042, + "train_speed(iter/s)": 0.029425 + }, + { + "epoch": 0.3231870725170993, + "grad_norm": 0.05872050053297838, + "learning_rate": 1.5856827761084698e-05, + "loss": 0.45543718338012695, + "memory(GiB)": 76.61, + "step": 1010, + "token_acc": 0.8753668220265838, + "train_speed(iter/s)": 0.02945 + }, + { + "epoch": 0.3247870085196592, + "grad_norm": 0.05268695066428434, + "learning_rate": 1.5814659935563165e-05, + "loss": 0.46614727973937986, + "memory(GiB)": 76.61, + "step": 1015, + "token_acc": 0.8792250035355678, + "train_speed(iter/s)": 0.029474 + }, + { + "epoch": 0.3263869445222191, + "grad_norm": 0.059454673806441594, + "learning_rate": 1.577233532221474e-05, + "loss": 0.45902605056762696, + "memory(GiB)": 76.61, + "step": 1020, + "token_acc": 0.86709886547812, + "train_speed(iter/s)": 0.029475 + }, + { + "epoch": 0.32798688052477903, + "grad_norm": 0.053728974295076275, + "learning_rate": 1.5729855062290024e-05, + "loss": 0.46491541862487795, + "memory(GiB)": 76.61, + "step": 1025, + "token_acc": 0.8708870261478794, + "train_speed(iter/s)": 0.029469 + }, + { + "epoch": 0.3295868165273389, + "grad_norm": 0.07030309576814114, + "learning_rate": 1.568722030123651e-05, + "loss": 0.453840970993042, + "memory(GiB)": 76.61, + "step": 1030, + "token_acc": 0.8568111455108359, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.3311867525298988, + "grad_norm": 0.07385415365022158, + "learning_rate": 1.5644432188667695e-05, + "loss": 0.45582828521728513, + "memory(GiB)": 76.61, + "step": 1035, + "token_acc": 0.8800162140251317, + "train_speed(iter/s)": 0.029488 + }, + { + "epoch": 0.3327866885324587, + "grad_norm": 0.05407863995123405, + "learning_rate": 1.5601491878332077e-05, + "loss": 0.4665637969970703, + "memory(GiB)": 76.61, + "step": 1040, + "token_acc": 0.8628481345244351, + "train_speed(iter/s)": 0.029487 + }, + { + "epoch": 0.3343866245350186, + "grad_norm": 0.05879461372080454, + "learning_rate": 1.5558400528082057e-05, + "loss": 0.4657593250274658, + "memory(GiB)": 76.61, + "step": 1045, + "token_acc": 0.879185119574845, + "train_speed(iter/s)": 0.02951 + }, + { + "epoch": 0.33598656053757847, + "grad_norm": 0.06618244368029796, + "learning_rate": 1.551515929984271e-05, + "loss": 0.45760574340820315, + "memory(GiB)": 76.61, + "step": 1050, + "token_acc": 0.8899380348185305, + "train_speed(iter/s)": 0.029502 + }, + { + "epoch": 0.3375864965401384, + "grad_norm": 0.06388796415692906, + "learning_rate": 1.547176935958044e-05, + "loss": 0.46065597534179686, + "memory(GiB)": 76.61, + "step": 1055, + "token_acc": 0.8536853685368537, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.3391864325426983, + "grad_norm": 0.05811152365312673, + "learning_rate": 1.5428231877271584e-05, + "loss": 0.46312780380249025, + "memory(GiB)": 76.61, + "step": 1060, + "token_acc": 0.8520375161707633, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.3407863685452582, + "grad_norm": 0.05545936328508829, + "learning_rate": 1.538454802687081e-05, + "loss": 0.4615220546722412, + "memory(GiB)": 76.61, + "step": 1065, + "token_acc": 0.8744265080713679, + "train_speed(iter/s)": 0.029504 + }, + { + "epoch": 0.3423863045478181, + "grad_norm": 0.05964362984731802, + "learning_rate": 1.5340718986279505e-05, + "loss": 0.46706466674804686, + "memory(GiB)": 76.61, + "step": 1070, + "token_acc": 0.8592233009708737, + "train_speed(iter/s)": 0.029536 + }, + { + "epoch": 0.34398624055037796, + "grad_norm": 0.05356886450328198, + "learning_rate": 1.529674593731399e-05, + "loss": 0.45301499366760256, + "memory(GiB)": 76.61, + "step": 1075, + "token_acc": 0.8575192096597146, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.3455861765529379, + "grad_norm": 0.05995962073425321, + "learning_rate": 1.5252630065673662e-05, + "loss": 0.46819314956665037, + "memory(GiB)": 76.61, + "step": 1080, + "token_acc": 0.8875031814711123, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.3471861125554978, + "grad_norm": 0.05389432634852101, + "learning_rate": 1.5208372560909031e-05, + "loss": 0.46298394203186033, + "memory(GiB)": 76.61, + "step": 1085, + "token_acc": 0.8872426699937617, + "train_speed(iter/s)": 0.029543 + }, + { + "epoch": 0.34878604855805767, + "grad_norm": 0.06642390255342462, + "learning_rate": 1.5163974616389621e-05, + "loss": 0.45978522300720215, + "memory(GiB)": 76.61, + "step": 1090, + "token_acc": 0.8246258860593332, + "train_speed(iter/s)": 0.029525 + }, + { + "epoch": 0.3503859845606176, + "grad_norm": 0.06115184110491886, + "learning_rate": 1.5119437429271813e-05, + "loss": 0.4637304782867432, + "memory(GiB)": 76.61, + "step": 1095, + "token_acc": 0.8666082895504962, + "train_speed(iter/s)": 0.029534 + }, + { + "epoch": 0.35198592056317746, + "grad_norm": 0.060865150660591956, + "learning_rate": 1.5074762200466557e-05, + "loss": 0.4542848587036133, + "memory(GiB)": 76.61, + "step": 1100, + "token_acc": 0.8913602663035255, + "train_speed(iter/s)": 0.029544 + }, + { + "epoch": 0.3535858565657374, + "grad_norm": 0.057666943430007674, + "learning_rate": 1.5029950134606991e-05, + "loss": 0.4574248790740967, + "memory(GiB)": 76.61, + "step": 1105, + "token_acc": 0.8634470336597996, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.35518579256829724, + "grad_norm": 0.054034554153381265, + "learning_rate": 1.4985002440015959e-05, + "loss": 0.4520272254943848, + "memory(GiB)": 76.61, + "step": 1110, + "token_acc": 0.8674898358680921, + "train_speed(iter/s)": 0.029551 + }, + { + "epoch": 0.35678572857085716, + "grad_norm": 0.06416854479766453, + "learning_rate": 1.4939920328673422e-05, + "loss": 0.4668846130371094, + "memory(GiB)": 76.61, + "step": 1115, + "token_acc": 0.9170854271356784, + "train_speed(iter/s)": 0.029541 + }, + { + "epoch": 0.3583856645734171, + "grad_norm": 0.05775941336987237, + "learning_rate": 1.4894705016183803e-05, + "loss": 0.4518620491027832, + "memory(GiB)": 76.61, + "step": 1120, + "token_acc": 0.8672782874617737, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.35998560057597695, + "grad_norm": 0.0625175589581686, + "learning_rate": 1.4849357721743169e-05, + "loss": 0.4566941738128662, + "memory(GiB)": 76.61, + "step": 1125, + "token_acc": 0.8505491793163026, + "train_speed(iter/s)": 0.029557 + }, + { + "epoch": 0.36158553657853687, + "grad_norm": 0.05911529293553411, + "learning_rate": 1.4803879668106393e-05, + "loss": 0.4640664577484131, + "memory(GiB)": 76.61, + "step": 1130, + "token_acc": 0.8772325625117503, + "train_speed(iter/s)": 0.029544 + }, + { + "epoch": 0.36318547258109674, + "grad_norm": 0.06483783687935218, + "learning_rate": 1.4758272081554168e-05, + "loss": 0.45419878959655763, + "memory(GiB)": 76.61, + "step": 1135, + "token_acc": 0.8594914930223667, + "train_speed(iter/s)": 0.029539 + }, + { + "epoch": 0.36478540858365666, + "grad_norm": 0.06032730304497941, + "learning_rate": 1.4712536191859934e-05, + "loss": 0.45779004096984866, + "memory(GiB)": 76.61, + "step": 1140, + "token_acc": 0.8938053097345132, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.3663853445862166, + "grad_norm": 0.0637380940226065, + "learning_rate": 1.4666673232256738e-05, + "loss": 0.46385722160339354, + "memory(GiB)": 76.61, + "step": 1145, + "token_acc": 0.8621830209481808, + "train_speed(iter/s)": 0.029544 + }, + { + "epoch": 0.36798528058877644, + "grad_norm": 0.057006770373085346, + "learning_rate": 1.4620684439403962e-05, + "loss": 0.4613553524017334, + "memory(GiB)": 76.61, + "step": 1150, + "token_acc": 0.8831837819873712, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.36958521659133636, + "grad_norm": 0.057569299635009126, + "learning_rate": 1.4574571053353987e-05, + "loss": 0.4598341464996338, + "memory(GiB)": 76.61, + "step": 1155, + "token_acc": 0.8825154371140721, + "train_speed(iter/s)": 0.029557 + }, + { + "epoch": 0.37118515259389623, + "grad_norm": 0.06747695219063263, + "learning_rate": 1.452833431751875e-05, + "loss": 0.4570640563964844, + "memory(GiB)": 76.61, + "step": 1160, + "token_acc": 0.8726823238566132, + "train_speed(iter/s)": 0.029543 + }, + { + "epoch": 0.37278508859645615, + "grad_norm": 0.05405367649749466, + "learning_rate": 1.448197547863622e-05, + "loss": 0.4516812801361084, + "memory(GiB)": 76.61, + "step": 1165, + "token_acc": 0.8704696273608984, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.374385024599016, + "grad_norm": 0.06041157710672601, + "learning_rate": 1.4435495786736796e-05, + "loss": 0.465837287902832, + "memory(GiB)": 76.61, + "step": 1170, + "token_acc": 0.8673412029229904, + "train_speed(iter/s)": 0.029554 + }, + { + "epoch": 0.37598496060157593, + "grad_norm": 0.05229585247228306, + "learning_rate": 1.438889649510956e-05, + "loss": 0.4427653789520264, + "memory(GiB)": 76.61, + "step": 1175, + "token_acc": 0.8558139534883721, + "train_speed(iter/s)": 0.02954 + }, + { + "epoch": 0.37758489660413586, + "grad_norm": 0.0547875272797444, + "learning_rate": 1.4342178860268523e-05, + "loss": 0.45673260688781736, + "memory(GiB)": 76.61, + "step": 1180, + "token_acc": 0.880563238622077, + "train_speed(iter/s)": 0.029563 + }, + { + "epoch": 0.3791848326066957, + "grad_norm": 0.0565328006493161, + "learning_rate": 1.4295344141918734e-05, + "loss": 0.46208748817443845, + "memory(GiB)": 76.61, + "step": 1185, + "token_acc": 0.8671328671328671, + "train_speed(iter/s)": 0.029544 + }, + { + "epoch": 0.38078476860925564, + "grad_norm": 0.062473905403265834, + "learning_rate": 1.4248393602922299e-05, + "loss": 0.46883163452148435, + "memory(GiB)": 76.61, + "step": 1190, + "token_acc": 0.8412252145605209, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.3823847046118155, + "grad_norm": 0.05646151042315891, + "learning_rate": 1.420132850926434e-05, + "loss": 0.45732822418212893, + "memory(GiB)": 76.61, + "step": 1195, + "token_acc": 0.8820655966503839, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.3839846406143754, + "grad_norm": 0.052981558367052706, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.45579113960266116, + "memory(GiB)": 76.61, + "step": 1200, + "token_acc": 0.8677085226240233, + "train_speed(iter/s)": 0.029546 + }, + { + "epoch": 0.38558457661693535, + "grad_norm": 0.052315204322432474, + "learning_rate": 1.4106859737314532e-05, + "loss": 0.45348801612854006, + "memory(GiB)": 76.61, + "step": 1205, + "token_acc": 0.8616187989556136, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.3871845126194952, + "grad_norm": 0.05319888084520812, + "learning_rate": 1.4059458606300358e-05, + "loss": 0.45279593467712403, + "memory(GiB)": 76.61, + "step": 1210, + "token_acc": 0.86090645233311, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.38878444862205513, + "grad_norm": 0.054475973938428034, + "learning_rate": 1.4011948015111334e-05, + "loss": 0.4616706848144531, + "memory(GiB)": 76.61, + "step": 1215, + "token_acc": 0.8390133684805121, + "train_speed(iter/s)": 0.029549 + }, + { + "epoch": 0.390384384624615, + "grad_norm": 0.054891067059900926, + "learning_rate": 1.396432924483396e-05, + "loss": 0.4553243637084961, + "memory(GiB)": 76.61, + "step": 1220, + "token_acc": 0.8715350793347353, + "train_speed(iter/s)": 0.029571 + }, + { + "epoch": 0.3919843206271749, + "grad_norm": 0.06058246643434403, + "learning_rate": 1.3916603579471705e-05, + "loss": 0.47067904472351074, + "memory(GiB)": 76.61, + "step": 1225, + "token_acc": 0.8662144337667232, + "train_speed(iter/s)": 0.029556 + }, + { + "epoch": 0.3935842566297348, + "grad_norm": 0.05715510214651738, + "learning_rate": 1.3868772305910376e-05, + "loss": 0.46147928237915037, + "memory(GiB)": 76.61, + "step": 1230, + "token_acc": 0.868918375552875, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.3951841926322947, + "grad_norm": 0.06593047910666934, + "learning_rate": 1.3820836713883424e-05, + "loss": 0.45935769081115724, + "memory(GiB)": 76.61, + "step": 1235, + "token_acc": 0.8596291476903057, + "train_speed(iter/s)": 0.02957 + }, + { + "epoch": 0.3967841286348546, + "grad_norm": 0.056071042953882384, + "learning_rate": 1.3772798095937172e-05, + "loss": 0.4495890140533447, + "memory(GiB)": 76.61, + "step": 1240, + "token_acc": 0.8471917163476623, + "train_speed(iter/s)": 0.029553 + }, + { + "epoch": 0.3983840646374145, + "grad_norm": 0.05810589720196263, + "learning_rate": 1.3724657747395957e-05, + "loss": 0.4619898319244385, + "memory(GiB)": 76.61, + "step": 1245, + "token_acc": 0.8691186216037111, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.3999840006399744, + "grad_norm": 0.055604926632171425, + "learning_rate": 1.3676416966327201e-05, + "loss": 0.4587514400482178, + "memory(GiB)": 76.61, + "step": 1250, + "token_acc": 0.8369355461211887, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.3999840006399744, + "eval_loss": 0.6690404415130615, + "eval_runtime": 106.3444, + "eval_samples_per_second": 188.896, + "eval_steps_per_second": 0.95, + "eval_token_acc": 0.8683678146748934, + "step": 1250 + }, + { + "epoch": 0.4015839366425343, + "grad_norm": 0.04782987834900457, + "learning_rate": 1.362807705350641e-05, + "loss": 0.46315851211547854, + "memory(GiB)": 76.61, + "step": 1255, + "token_acc": 0.8767961498796838, + "train_speed(iter/s)": 0.029512 + }, + { + "epoch": 0.4031838726450942, + "grad_norm": 0.05995996443795485, + "learning_rate": 1.3579639312382105e-05, + "loss": 0.46349530220031737, + "memory(GiB)": 76.61, + "step": 1260, + "token_acc": 0.8588617886178862, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.4047838086476541, + "grad_norm": 0.06488882353036057, + "learning_rate": 1.3531105049040667e-05, + "loss": 0.45726447105407714, + "memory(GiB)": 76.61, + "step": 1265, + "token_acc": 0.8802249582003344, + "train_speed(iter/s)": 0.029543 + }, + { + "epoch": 0.406383744650214, + "grad_norm": 0.05350128050935312, + "learning_rate": 1.3482475572171132e-05, + "loss": 0.4516806125640869, + "memory(GiB)": 76.61, + "step": 1270, + "token_acc": 0.8560765550239234, + "train_speed(iter/s)": 0.029549 + }, + { + "epoch": 0.4079836806527739, + "grad_norm": 0.05672697687392494, + "learning_rate": 1.3433752193029888e-05, + "loss": 0.46581568717956545, + "memory(GiB)": 76.61, + "step": 1275, + "token_acc": 0.8881742738589211, + "train_speed(iter/s)": 0.029547 + }, + { + "epoch": 0.40958361665533377, + "grad_norm": 0.0598115330947421, + "learning_rate": 1.3384936225405326e-05, + "loss": 0.46333680152893064, + "memory(GiB)": 76.61, + "step": 1280, + "token_acc": 0.8608710985716804, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.4111835526578937, + "grad_norm": 0.05384417907735887, + "learning_rate": 1.333602898558242e-05, + "loss": 0.4611030578613281, + "memory(GiB)": 76.61, + "step": 1285, + "token_acc": 0.8845689770746749, + "train_speed(iter/s)": 0.029567 + }, + { + "epoch": 0.41278348866045356, + "grad_norm": 0.06043637267465684, + "learning_rate": 1.3287031792307226e-05, + "loss": 0.46013875007629396, + "memory(GiB)": 76.61, + "step": 1290, + "token_acc": 0.870195210303884, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.4143834246630135, + "grad_norm": 0.06140603532631629, + "learning_rate": 1.323794596675132e-05, + "loss": 0.45681238174438477, + "memory(GiB)": 76.61, + "step": 1295, + "token_acc": 0.8450012281994596, + "train_speed(iter/s)": 0.029583 + }, + { + "epoch": 0.4159833606655734, + "grad_norm": 0.062077229851937275, + "learning_rate": 1.318877283247619e-05, + "loss": 0.4490199565887451, + "memory(GiB)": 76.61, + "step": 1300, + "token_acc": 0.89259877573734, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.41758329666813326, + "grad_norm": 0.05216177276902916, + "learning_rate": 1.3139513715397521e-05, + "loss": 0.45108351707458494, + "memory(GiB)": 76.61, + "step": 1305, + "token_acc": 0.8547701815372731, + "train_speed(iter/s)": 0.029594 + }, + { + "epoch": 0.4191832326706932, + "grad_norm": 0.05738628087610287, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.4558550834655762, + "memory(GiB)": 76.61, + "step": 1310, + "token_acc": 0.840696686491079, + "train_speed(iter/s)": 0.029587 + }, + { + "epoch": 0.42078316867325305, + "grad_norm": 0.05518036740697275, + "learning_rate": 1.304074284804885e-05, + "loss": 0.4631648063659668, + "memory(GiB)": 76.61, + "step": 1315, + "token_acc": 0.8788111708941839, + "train_speed(iter/s)": 0.029578 + }, + { + "epoch": 0.42238310467581297, + "grad_norm": 0.05902492258138098, + "learning_rate": 1.2991233761059214e-05, + "loss": 0.45921921730041504, + "memory(GiB)": 76.61, + "step": 1320, + "token_acc": 0.866059646344682, + "train_speed(iter/s)": 0.029604 + }, + { + "epoch": 0.4239830406783729, + "grad_norm": 0.059535437419073044, + "learning_rate": 1.2941644017754964e-05, + "loss": 0.46445517539978026, + "memory(GiB)": 76.61, + "step": 1325, + "token_acc": 0.8831345826235094, + "train_speed(iter/s)": 0.029591 + }, + { + "epoch": 0.42558297668093276, + "grad_norm": 0.04863893443696892, + "learning_rate": 1.289197495528534e-05, + "loss": 0.45836362838745115, + "memory(GiB)": 76.61, + "step": 1330, + "token_acc": 0.8986429177268872, + "train_speed(iter/s)": 0.029582 + }, + { + "epoch": 0.4271829126834927, + "grad_norm": 0.05945822860509985, + "learning_rate": 1.284222791293836e-05, + "loss": 0.45783252716064454, + "memory(GiB)": 76.61, + "step": 1335, + "token_acc": 0.8500874125874126, + "train_speed(iter/s)": 0.029596 + }, + { + "epoch": 0.42878284868605254, + "grad_norm": 0.05989616737178823, + "learning_rate": 1.2792404232104699e-05, + "loss": 0.45293269157409666, + "memory(GiB)": 76.61, + "step": 1340, + "token_acc": 0.8637480798771121, + "train_speed(iter/s)": 0.029584 + }, + { + "epoch": 0.43038278468861246, + "grad_norm": 0.0586629819404024, + "learning_rate": 1.2742505256241543e-05, + "loss": 0.45876450538635255, + "memory(GiB)": 76.61, + "step": 1345, + "token_acc": 0.8296499119890475, + "train_speed(iter/s)": 0.029588 + }, + { + "epoch": 0.43198272069117233, + "grad_norm": 0.052924904785980484, + "learning_rate": 1.2692532330836346e-05, + "loss": 0.45821080207824705, + "memory(GiB)": 76.61, + "step": 1350, + "token_acc": 0.8636084374360025, + "train_speed(iter/s)": 0.029594 + }, + { + "epoch": 0.43358265669373225, + "grad_norm": 0.059304249814977644, + "learning_rate": 1.2642486803370553e-05, + "loss": 0.45485148429870603, + "memory(GiB)": 76.61, + "step": 1355, + "token_acc": 0.8686690223792697, + "train_speed(iter/s)": 0.029579 + }, + { + "epoch": 0.43518259269629217, + "grad_norm": 0.06253442360689314, + "learning_rate": 1.2592370023283268e-05, + "loss": 0.45198469161987304, + "memory(GiB)": 76.61, + "step": 1360, + "token_acc": 0.8737075332348597, + "train_speed(iter/s)": 0.0296 + }, + { + "epoch": 0.43678252869885204, + "grad_norm": 0.05314091037792793, + "learning_rate": 1.2542183341934873e-05, + "loss": 0.4516898155212402, + "memory(GiB)": 76.61, + "step": 1365, + "token_acc": 0.8714476021314387, + "train_speed(iter/s)": 0.029596 + }, + { + "epoch": 0.43838246470141196, + "grad_norm": 0.06014404788689081, + "learning_rate": 1.2491928112570568e-05, + "loss": 0.45399184226989747, + "memory(GiB)": 76.61, + "step": 1370, + "token_acc": 0.8657097288676237, + "train_speed(iter/s)": 0.029583 + }, + { + "epoch": 0.4399824007039718, + "grad_norm": 0.05910144328100835, + "learning_rate": 1.2441605690283915e-05, + "loss": 0.4607128143310547, + "memory(GiB)": 76.61, + "step": 1375, + "token_acc": 0.8990952307928232, + "train_speed(iter/s)": 0.029603 + }, + { + "epoch": 0.44158233670653174, + "grad_norm": 0.059073628736854025, + "learning_rate": 1.2391217431980273e-05, + "loss": 0.4515543937683105, + "memory(GiB)": 76.61, + "step": 1380, + "token_acc": 0.9016349860428021, + "train_speed(iter/s)": 0.029591 + }, + { + "epoch": 0.44318227270909166, + "grad_norm": 0.058358968679540275, + "learning_rate": 1.234076469634022e-05, + "loss": 0.45762925148010253, + "memory(GiB)": 76.61, + "step": 1385, + "token_acc": 0.8919261822376009, + "train_speed(iter/s)": 0.029584 + }, + { + "epoch": 0.4447822087116515, + "grad_norm": 0.0672513399669503, + "learning_rate": 1.2290248843782915e-05, + "loss": 0.44803729057312014, + "memory(GiB)": 76.61, + "step": 1390, + "token_acc": 0.8975998070196599, + "train_speed(iter/s)": 0.029597 + }, + { + "epoch": 0.44638214471421145, + "grad_norm": 0.05793114375836921, + "learning_rate": 1.2239671236429413e-05, + "loss": 0.4537235736846924, + "memory(GiB)": 76.61, + "step": 1395, + "token_acc": 0.8839514422541486, + "train_speed(iter/s)": 0.02958 + }, + { + "epoch": 0.4479820807167713, + "grad_norm": 0.05955306099185102, + "learning_rate": 1.218903323806595e-05, + "loss": 0.4573692798614502, + "memory(GiB)": 76.61, + "step": 1400, + "token_acc": 0.8418099547511312, + "train_speed(iter/s)": 0.029594 + }, + { + "epoch": 0.44958201671933123, + "grad_norm": 0.058484796569864064, + "learning_rate": 1.2138336214107148e-05, + "loss": 0.44894704818725584, + "memory(GiB)": 76.61, + "step": 1405, + "token_acc": 0.8525200458190149, + "train_speed(iter/s)": 0.029594 + }, + { + "epoch": 0.4511819527218911, + "grad_norm": 0.05092836798588581, + "learning_rate": 1.2087581531559208e-05, + "loss": 0.45393967628479004, + "memory(GiB)": 76.61, + "step": 1410, + "token_acc": 0.8791390728476821, + "train_speed(iter/s)": 0.02958 + }, + { + "epoch": 0.452781888724451, + "grad_norm": 0.07033477253264378, + "learning_rate": 1.2036770558983067e-05, + "loss": 0.45307221412658694, + "memory(GiB)": 76.61, + "step": 1415, + "token_acc": 0.8387482900136799, + "train_speed(iter/s)": 0.029599 + }, + { + "epoch": 0.45438182472701094, + "grad_norm": 0.05966547548288182, + "learning_rate": 1.1985904666457455e-05, + "loss": 0.455959415435791, + "memory(GiB)": 76.61, + "step": 1420, + "token_acc": 0.9042096902303416, + "train_speed(iter/s)": 0.029583 + }, + { + "epoch": 0.4559817607295708, + "grad_norm": 0.08159145764722696, + "learning_rate": 1.1934985225541998e-05, + "loss": 0.462065601348877, + "memory(GiB)": 76.61, + "step": 1425, + "token_acc": 0.885252444621832, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.4575816967321307, + "grad_norm": 0.05540814227664117, + "learning_rate": 1.18840136092402e-05, + "loss": 0.4551572322845459, + "memory(GiB)": 76.61, + "step": 1430, + "token_acc": 0.8559651934966797, + "train_speed(iter/s)": 0.029592 + }, + { + "epoch": 0.4591816327346906, + "grad_norm": 0.05534004007067895, + "learning_rate": 1.1832991191962435e-05, + "loss": 0.4455368995666504, + "memory(GiB)": 76.61, + "step": 1435, + "token_acc": 0.875560538116592, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.4607815687372505, + "grad_norm": 0.058276771895487044, + "learning_rate": 1.1781919349488894e-05, + "loss": 0.4590908527374268, + "memory(GiB)": 76.61, + "step": 1440, + "token_acc": 0.8510418460478733, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.46238150473981043, + "grad_norm": 0.05839975543902795, + "learning_rate": 1.1730799458932473e-05, + "loss": 0.462816858291626, + "memory(GiB)": 76.61, + "step": 1445, + "token_acc": 0.9052378085490669, + "train_speed(iter/s)": 0.029586 + }, + { + "epoch": 0.4639814407423703, + "grad_norm": 0.07084434546926481, + "learning_rate": 1.1679632898701649e-05, + "loss": 0.4550295829772949, + "memory(GiB)": 76.61, + "step": 1450, + "token_acc": 0.8805626598465474, + "train_speed(iter/s)": 0.029572 + }, + { + "epoch": 0.4655813767449302, + "grad_norm": 0.06519996046237972, + "learning_rate": 1.1628421048463315e-05, + "loss": 0.46291208267211914, + "memory(GiB)": 76.61, + "step": 1455, + "token_acc": 0.8565744600227359, + "train_speed(iter/s)": 0.029581 + }, + { + "epoch": 0.4671813127474901, + "grad_norm": 0.05799269979733804, + "learning_rate": 1.1577165289105565e-05, + "loss": 0.4474311351776123, + "memory(GiB)": 76.61, + "step": 1460, + "token_acc": 0.8579789309403043, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.46878124875005, + "grad_norm": 0.057120675003187855, + "learning_rate": 1.1525867002700484e-05, + "loss": 0.46109714508056643, + "memory(GiB)": 76.61, + "step": 1465, + "token_acc": 0.8752182516587126, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.47038118475260987, + "grad_norm": 0.05696370798749074, + "learning_rate": 1.1474527572466847e-05, + "loss": 0.4501948833465576, + "memory(GiB)": 76.61, + "step": 1470, + "token_acc": 0.8529032258064516, + "train_speed(iter/s)": 0.029562 + }, + { + "epoch": 0.4719811207551698, + "grad_norm": 0.05518112754329221, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.45941987037658694, + "memory(GiB)": 76.61, + "step": 1475, + "token_acc": 0.9009282399143164, + "train_speed(iter/s)": 0.029545 + }, + { + "epoch": 0.4735810567577297, + "grad_norm": 0.051496444525703684, + "learning_rate": 1.1371730818898785e-05, + "loss": 0.45296878814697267, + "memory(GiB)": 76.61, + "step": 1480, + "token_acc": 0.8814303638644918, + "train_speed(iter/s)": 0.029538 + }, + { + "epoch": 0.4751809927602896, + "grad_norm": 0.0677105428949175, + "learning_rate": 1.132027626739965e-05, + "loss": 0.45635080337524414, + "memory(GiB)": 76.61, + "step": 1485, + "token_acc": 0.880248833592535, + "train_speed(iter/s)": 0.029546 + }, + { + "epoch": 0.4767809287628495, + "grad_norm": 0.0673509631098402, + "learning_rate": 1.1268786115667798e-05, + "loss": 0.4614115715026855, + "memory(GiB)": 76.61, + "step": 1490, + "token_acc": 0.8609592251210748, + "train_speed(iter/s)": 0.029525 + }, + { + "epoch": 0.47838086476540936, + "grad_norm": 0.053337771378298794, + "learning_rate": 1.1217261752095518e-05, + "loss": 0.45500664710998534, + "memory(GiB)": 76.61, + "step": 1495, + "token_acc": 0.8794466403162056, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.4799808007679693, + "grad_norm": 0.05429302474155136, + "learning_rate": 1.1165704565997593e-05, + "loss": 0.44763407707214353, + "memory(GiB)": 76.61, + "step": 1500, + "token_acc": 0.8700440528634361, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.4799808007679693, + "eval_loss": 0.6668144464492798, + "eval_runtime": 124.2589, + "eval_samples_per_second": 161.662, + "eval_steps_per_second": 0.813, + "eval_token_acc": 0.8694583558206896, + "step": 1500 + }, + { + "epoch": 0.4815807367705292, + "grad_norm": 0.06146640524587408, + "learning_rate": 1.1114115947573834e-05, + "loss": 0.45711498260498046, + "memory(GiB)": 76.61, + "step": 1505, + "token_acc": 0.8695166967121641, + "train_speed(iter/s)": 0.029461 + }, + { + "epoch": 0.48318067277308907, + "grad_norm": 0.06183782693437151, + "learning_rate": 1.1062497287871606e-05, + "loss": 0.4499336242675781, + "memory(GiB)": 76.61, + "step": 1510, + "token_acc": 0.8487557381009906, + "train_speed(iter/s)": 0.029468 + }, + { + "epoch": 0.484780608775649, + "grad_norm": 0.056753269624682155, + "learning_rate": 1.1010849978748314e-05, + "loss": 0.4551094055175781, + "memory(GiB)": 76.61, + "step": 1515, + "token_acc": 0.8579035448045033, + "train_speed(iter/s)": 0.02948 + }, + { + "epoch": 0.48638054477820886, + "grad_norm": 0.05188962595699218, + "learning_rate": 1.0959175412833869e-05, + "loss": 0.4483503818511963, + "memory(GiB)": 76.61, + "step": 1520, + "token_acc": 0.8649127992905705, + "train_speed(iter/s)": 0.029477 + }, + { + "epoch": 0.4879804807807688, + "grad_norm": 0.0526414480661873, + "learning_rate": 1.0907474983493144e-05, + "loss": 0.45140752792358396, + "memory(GiB)": 76.61, + "step": 1525, + "token_acc": 0.8700204290091931, + "train_speed(iter/s)": 0.029464 + }, + { + "epoch": 0.48958041678332864, + "grad_norm": 0.06682159988119828, + "learning_rate": 1.08557500847884e-05, + "loss": 0.4480952262878418, + "memory(GiB)": 76.61, + "step": 1530, + "token_acc": 0.8906385187748745, + "train_speed(iter/s)": 0.029478 + }, + { + "epoch": 0.49118035278588856, + "grad_norm": 0.06117568492897364, + "learning_rate": 1.080400211144169e-05, + "loss": 0.453688907623291, + "memory(GiB)": 76.61, + "step": 1535, + "token_acc": 0.8130096719135217, + "train_speed(iter/s)": 0.029469 + }, + { + "epoch": 0.4927802887884485, + "grad_norm": 0.05143211947513191, + "learning_rate": 1.0752232458797262e-05, + "loss": 0.44568753242492676, + "memory(GiB)": 76.61, + "step": 1540, + "token_acc": 0.847257743677181, + "train_speed(iter/s)": 0.029464 + }, + { + "epoch": 0.49438022479100835, + "grad_norm": 0.05201971134010435, + "learning_rate": 1.070044252278393e-05, + "loss": 0.46500363349914553, + "memory(GiB)": 76.61, + "step": 1545, + "token_acc": 0.84, + "train_speed(iter/s)": 0.029474 + }, + { + "epoch": 0.49598016079356827, + "grad_norm": 0.05304880581645989, + "learning_rate": 1.064863369987743e-05, + "loss": 0.4501206398010254, + "memory(GiB)": 76.61, + "step": 1550, + "token_acc": 0.8888641920426762, + "train_speed(iter/s)": 0.029465 + }, + { + "epoch": 0.49758009679612814, + "grad_norm": 0.050584443072610216, + "learning_rate": 1.0596807387062772e-05, + "loss": 0.456621789932251, + "memory(GiB)": 76.61, + "step": 1555, + "token_acc": 0.8793768317137128, + "train_speed(iter/s)": 0.02948 + }, + { + "epoch": 0.49918003279868806, + "grad_norm": 0.05907676168100355, + "learning_rate": 1.0544964981796563e-05, + "loss": 0.4567122936248779, + "memory(GiB)": 76.61, + "step": 1560, + "token_acc": 0.8505747126436781, + "train_speed(iter/s)": 0.029477 + }, + { + "epoch": 0.500779968801248, + "grad_norm": 0.055037989511506104, + "learning_rate": 1.0493107881969335e-05, + "loss": 0.44720020294189455, + "memory(GiB)": 76.61, + "step": 1565, + "token_acc": 0.8853304383227032, + "train_speed(iter/s)": 0.029466 + }, + { + "epoch": 0.5023799048038079, + "grad_norm": 0.0597376748229471, + "learning_rate": 1.0441237485867845e-05, + "loss": 0.4492997169494629, + "memory(GiB)": 76.61, + "step": 1570, + "token_acc": 0.8809347181008902, + "train_speed(iter/s)": 0.029489 + }, + { + "epoch": 0.5039798408063677, + "grad_norm": 0.060265741182571844, + "learning_rate": 1.0389355192137379e-05, + "loss": 0.4525942325592041, + "memory(GiB)": 76.61, + "step": 1575, + "token_acc": 0.8839541547277937, + "train_speed(iter/s)": 0.029481 + }, + { + "epoch": 0.5055797768089276, + "grad_norm": 0.06015007204584338, + "learning_rate": 1.0337462399744025e-05, + "loss": 0.4606604099273682, + "memory(GiB)": 76.61, + "step": 1580, + "token_acc": 0.8439696373348328, + "train_speed(iter/s)": 0.029471 + }, + { + "epoch": 0.5071797128114875, + "grad_norm": 0.0539606724017438, + "learning_rate": 1.0285560507936962e-05, + "loss": 0.46471481323242186, + "memory(GiB)": 76.61, + "step": 1585, + "token_acc": 0.8212732305258995, + "train_speed(iter/s)": 0.029486 + }, + { + "epoch": 0.5087796488140475, + "grad_norm": 0.0588254805138369, + "learning_rate": 1.0233650916210736e-05, + "loss": 0.45154604911804197, + "memory(GiB)": 76.61, + "step": 1590, + "token_acc": 0.883357041251778, + "train_speed(iter/s)": 0.029474 + }, + { + "epoch": 0.5103795848166074, + "grad_norm": 0.06409304780777438, + "learning_rate": 1.0181735024267504e-05, + "loss": 0.45000271797180175, + "memory(GiB)": 76.61, + "step": 1595, + "token_acc": 0.8340197693574959, + "train_speed(iter/s)": 0.029485 + }, + { + "epoch": 0.5119795208191672, + "grad_norm": 0.058200459243944895, + "learning_rate": 1.012981423197931e-05, + "loss": 0.4608008861541748, + "memory(GiB)": 76.61, + "step": 1600, + "token_acc": 0.8793913904007917, + "train_speed(iter/s)": 0.029484 + }, + { + "epoch": 0.5135794568217271, + "grad_norm": 0.052541653818392466, + "learning_rate": 1.007788993935033e-05, + "loss": 0.45448942184448243, + "memory(GiB)": 76.61, + "step": 1605, + "token_acc": 0.8615229110512129, + "train_speed(iter/s)": 0.029472 + }, + { + "epoch": 0.515179392824287, + "grad_norm": 0.06526426191856917, + "learning_rate": 1.002596354647912e-05, + "loss": 0.45614914894104003, + "memory(GiB)": 76.61, + "step": 1610, + "token_acc": 0.871312462372065, + "train_speed(iter/s)": 0.029489 + }, + { + "epoch": 0.516779328826847, + "grad_norm": 0.05743481751682084, + "learning_rate": 9.974036453520881e-06, + "loss": 0.447450590133667, + "memory(GiB)": 76.61, + "step": 1615, + "token_acc": 0.8760352658295485, + "train_speed(iter/s)": 0.02948 + }, + { + "epoch": 0.5183792648294068, + "grad_norm": 0.06577378911981274, + "learning_rate": 9.922110060649672e-06, + "loss": 0.45809640884399416, + "memory(GiB)": 76.61, + "step": 1620, + "token_acc": 0.9048299514146899, + "train_speed(iter/s)": 0.029468 + }, + { + "epoch": 0.5199792008319667, + "grad_norm": 0.05180626448607971, + "learning_rate": 9.870185768020694e-06, + "loss": 0.4360641002655029, + "memory(GiB)": 76.61, + "step": 1625, + "token_acc": 0.890797148412184, + "train_speed(iter/s)": 0.029481 + }, + { + "epoch": 0.5215791368345266, + "grad_norm": 0.048795347699578454, + "learning_rate": 9.818264975732497e-06, + "loss": 0.4505919933319092, + "memory(GiB)": 76.61, + "step": 1630, + "token_acc": 0.8830155979202773, + "train_speed(iter/s)": 0.029462 + }, + { + "epoch": 0.5231790728370865, + "grad_norm": 0.054325754690003863, + "learning_rate": 9.766349083789266e-06, + "loss": 0.4518167495727539, + "memory(GiB)": 76.61, + "step": 1635, + "token_acc": 0.8740636704119851, + "train_speed(iter/s)": 0.029458 + }, + { + "epoch": 0.5247790088396465, + "grad_norm": 0.05473270131153146, + "learning_rate": 9.71443949206304e-06, + "loss": 0.4629377841949463, + "memory(GiB)": 76.61, + "step": 1640, + "token_acc": 0.8927648578811369, + "train_speed(iter/s)": 0.029469 + }, + { + "epoch": 0.5263789448422063, + "grad_norm": 0.05962553624327487, + "learning_rate": 9.662537600255979e-06, + "loss": 0.4535552501678467, + "memory(GiB)": 76.61, + "step": 1645, + "token_acc": 0.8980960623106881, + "train_speed(iter/s)": 0.029457 + }, + { + "epoch": 0.5279788808447662, + "grad_norm": 0.06367541172972058, + "learning_rate": 9.610644807862625e-06, + "loss": 0.44418978691101074, + "memory(GiB)": 76.61, + "step": 1650, + "token_acc": 0.8769371011850501, + "train_speed(iter/s)": 0.029468 + }, + { + "epoch": 0.5295788168473261, + "grad_norm": 0.05288367644088033, + "learning_rate": 9.558762514132157e-06, + "loss": 0.4513704299926758, + "memory(GiB)": 76.61, + "step": 1655, + "token_acc": 0.8576478906434126, + "train_speed(iter/s)": 0.029464 + }, + { + "epoch": 0.531178752849886, + "grad_norm": 0.054919719691940275, + "learning_rate": 9.506892118030668e-06, + "loss": 0.4454075336456299, + "memory(GiB)": 76.61, + "step": 1660, + "token_acc": 0.8535078688042359, + "train_speed(iter/s)": 0.029456 + }, + { + "epoch": 0.532778688852446, + "grad_norm": 0.0561391497524031, + "learning_rate": 9.455035018203439e-06, + "loss": 0.4459484100341797, + "memory(GiB)": 76.61, + "step": 1665, + "token_acc": 0.8793124922157181, + "train_speed(iter/s)": 0.029471 + }, + { + "epoch": 0.5343786248550058, + "grad_norm": 0.051526152917333715, + "learning_rate": 9.40319261293723e-06, + "loss": 0.4593966484069824, + "memory(GiB)": 76.61, + "step": 1670, + "token_acc": 0.8957568638966378, + "train_speed(iter/s)": 0.029466 + }, + { + "epoch": 0.5359785608575657, + "grad_norm": 0.05336577516465571, + "learning_rate": 9.351366300122569e-06, + "loss": 0.45195541381835935, + "memory(GiB)": 76.61, + "step": 1675, + "token_acc": 0.8254120659305488, + "train_speed(iter/s)": 0.029459 + }, + { + "epoch": 0.5375784968601256, + "grad_norm": 0.05605931671991975, + "learning_rate": 9.299557477216073e-06, + "loss": 0.4473400115966797, + "memory(GiB)": 76.61, + "step": 1680, + "token_acc": 0.8684433164128595, + "train_speed(iter/s)": 0.029474 + }, + { + "epoch": 0.5391784328626855, + "grad_norm": 0.05284398220938548, + "learning_rate": 9.247767541202738e-06, + "loss": 0.4539934158325195, + "memory(GiB)": 76.61, + "step": 1685, + "token_acc": 0.8787515006002401, + "train_speed(iter/s)": 0.029458 + }, + { + "epoch": 0.5407783688652454, + "grad_norm": 0.06074778175058899, + "learning_rate": 9.195997888558312e-06, + "loss": 0.4540121078491211, + "memory(GiB)": 76.61, + "step": 1690, + "token_acc": 0.882076702321941, + "train_speed(iter/s)": 0.029458 + }, + { + "epoch": 0.5423783048678052, + "grad_norm": 0.06072504661929311, + "learning_rate": 9.144249915211605e-06, + "loss": 0.45176243782043457, + "memory(GiB)": 76.61, + "step": 1695, + "token_acc": 0.8652057386094908, + "train_speed(iter/s)": 0.029465 + }, + { + "epoch": 0.5439782408703652, + "grad_norm": 0.058552695609385315, + "learning_rate": 9.092525016506858e-06, + "loss": 0.4491862773895264, + "memory(GiB)": 76.61, + "step": 1700, + "token_acc": 0.8822588020118884, + "train_speed(iter/s)": 0.02945 + }, + { + "epoch": 0.5455781768729251, + "grad_norm": 0.056892490634495974, + "learning_rate": 9.040824587166136e-06, + "loss": 0.45043745040893557, + "memory(GiB)": 76.61, + "step": 1705, + "token_acc": 0.8825789923142613, + "train_speed(iter/s)": 0.029461 + }, + { + "epoch": 0.547178112875485, + "grad_norm": 0.05885692671807609, + "learning_rate": 8.98915002125169e-06, + "loss": 0.4475353240966797, + "memory(GiB)": 76.61, + "step": 1710, + "token_acc": 0.8721031538595574, + "train_speed(iter/s)": 0.029454 + }, + { + "epoch": 0.5487780488780449, + "grad_norm": 0.060276094585736115, + "learning_rate": 8.9375027121284e-06, + "loss": 0.4502556800842285, + "memory(GiB)": 76.61, + "step": 1715, + "token_acc": 0.8562842259917189, + "train_speed(iter/s)": 0.029449 + }, + { + "epoch": 0.5503779848806047, + "grad_norm": 0.06782068962590707, + "learning_rate": 8.885884052426168e-06, + "loss": 0.4532322883605957, + "memory(GiB)": 76.61, + "step": 1720, + "token_acc": 0.8593545573484518, + "train_speed(iter/s)": 0.029466 + }, + { + "epoch": 0.5519779208831647, + "grad_norm": 0.06070839045848377, + "learning_rate": 8.83429543400241e-06, + "loss": 0.45258092880249023, + "memory(GiB)": 76.61, + "step": 1725, + "token_acc": 0.8751242791807516, + "train_speed(iter/s)": 0.029452 + }, + { + "epoch": 0.5535778568857246, + "grad_norm": 0.049979952181739715, + "learning_rate": 8.78273824790448e-06, + "loss": 0.4340657234191895, + "memory(GiB)": 76.61, + "step": 1730, + "token_acc": 0.8650843222985634, + "train_speed(iter/s)": 0.029451 + }, + { + "epoch": 0.5551777928882845, + "grad_norm": 0.059124658124222323, + "learning_rate": 8.731213884332205e-06, + "loss": 0.43556828498840333, + "memory(GiB)": 76.61, + "step": 1735, + "token_acc": 0.8524390243902439, + "train_speed(iter/s)": 0.029459 + }, + { + "epoch": 0.5567777288908443, + "grad_norm": 0.05228309031135195, + "learning_rate": 8.679723732600355e-06, + "loss": 0.4483633041381836, + "memory(GiB)": 76.61, + "step": 1740, + "token_acc": 0.9039166284928997, + "train_speed(iter/s)": 0.029445 + }, + { + "epoch": 0.5583776648934042, + "grad_norm": 0.05659321921396489, + "learning_rate": 8.628269181101216e-06, + "loss": 0.45377864837646487, + "memory(GiB)": 76.61, + "step": 1745, + "token_acc": 0.8812897628687102, + "train_speed(iter/s)": 0.029449 + }, + { + "epoch": 0.5599776008959642, + "grad_norm": 0.0610469666222746, + "learning_rate": 8.576851617267151e-06, + "loss": 0.4495216369628906, + "memory(GiB)": 76.61, + "step": 1750, + "token_acc": 0.8734145104008117, + "train_speed(iter/s)": 0.029452 + }, + { + "epoch": 0.5599776008959642, + "eval_loss": 0.6640093922615051, + "eval_runtime": 114.9985, + "eval_samples_per_second": 174.681, + "eval_steps_per_second": 0.878, + "eval_token_acc": 0.8701128116616424, + "step": 1750 + }, + { + "epoch": 0.5615775368985241, + "grad_norm": 0.061306770256929585, + "learning_rate": 8.525472427533156e-06, + "loss": 0.44908857345581055, + "memory(GiB)": 77.63, + "step": 1755, + "token_acc": 0.8715457946180765, + "train_speed(iter/s)": 0.029409 + }, + { + "epoch": 0.563177472901084, + "grad_norm": 0.05235639827008535, + "learning_rate": 8.474132997299521e-06, + "loss": 0.4579316139221191, + "memory(GiB)": 77.63, + "step": 1760, + "token_acc": 0.8922923256201098, + "train_speed(iter/s)": 0.029422 + }, + { + "epoch": 0.5647774089036438, + "grad_norm": 0.051281007426132216, + "learning_rate": 8.422834710894434e-06, + "loss": 0.45467004776000974, + "memory(GiB)": 77.63, + "step": 1765, + "token_acc": 0.903878366189924, + "train_speed(iter/s)": 0.029438 + }, + { + "epoch": 0.5663773449062037, + "grad_norm": 0.05049109520782513, + "learning_rate": 8.371578951536689e-06, + "loss": 0.45294957160949706, + "memory(GiB)": 77.63, + "step": 1770, + "token_acc": 0.8928110202324581, + "train_speed(iter/s)": 0.029439 + }, + { + "epoch": 0.5679772809087636, + "grad_norm": 0.04946427707777728, + "learning_rate": 8.320367101298351e-06, + "loss": 0.4473431587219238, + "memory(GiB)": 77.63, + "step": 1775, + "token_acc": 0.8723599632690542, + "train_speed(iter/s)": 0.029439 + }, + { + "epoch": 0.5695772169113236, + "grad_norm": 0.053606352244487274, + "learning_rate": 8.26920054106753e-06, + "loss": 0.4495864391326904, + "memory(GiB)": 77.63, + "step": 1780, + "token_acc": 0.8844315111203492, + "train_speed(iter/s)": 0.029459 + }, + { + "epoch": 0.5711771529138835, + "grad_norm": 0.05525614374940963, + "learning_rate": 8.218080650511107e-06, + "loss": 0.44890499114990234, + "memory(GiB)": 77.63, + "step": 1785, + "token_acc": 0.8749736453721273, + "train_speed(iter/s)": 0.02946 + }, + { + "epoch": 0.5727770889164433, + "grad_norm": 0.05882148265537131, + "learning_rate": 8.167008808037568e-06, + "loss": 0.44676194190979, + "memory(GiB)": 77.63, + "step": 1790, + "token_acc": 0.8807511737089202, + "train_speed(iter/s)": 0.029457 + }, + { + "epoch": 0.5743770249190032, + "grad_norm": 0.048821121641334515, + "learning_rate": 8.115986390759805e-06, + "loss": 0.4417415142059326, + "memory(GiB)": 77.63, + "step": 1795, + "token_acc": 0.8531673379714391, + "train_speed(iter/s)": 0.029469 + }, + { + "epoch": 0.5759769609215631, + "grad_norm": 0.054949264031140505, + "learning_rate": 8.065014774458004e-06, + "loss": 0.46439437866210936, + "memory(GiB)": 77.63, + "step": 1800, + "token_acc": 0.8333022213925705, + "train_speed(iter/s)": 0.029467 + }, + { + "epoch": 0.5775768969241231, + "grad_norm": 0.059507220518762304, + "learning_rate": 8.014095333542548e-06, + "loss": 0.4539642333984375, + "memory(GiB)": 77.63, + "step": 1805, + "token_acc": 0.8577178858942948, + "train_speed(iter/s)": 0.029483 + }, + { + "epoch": 0.579176832926683, + "grad_norm": 0.05302143027350534, + "learning_rate": 7.963229441016938e-06, + "loss": 0.4606470108032227, + "memory(GiB)": 77.63, + "step": 1810, + "token_acc": 0.8760775862068966, + "train_speed(iter/s)": 0.029484 + }, + { + "epoch": 0.5807767689292428, + "grad_norm": 0.0699581228289572, + "learning_rate": 7.912418468440794e-06, + "loss": 0.4488551139831543, + "memory(GiB)": 77.63, + "step": 1815, + "token_acc": 0.8892276422764228, + "train_speed(iter/s)": 0.029481 + }, + { + "epoch": 0.5823767049318027, + "grad_norm": 0.053456667148895104, + "learning_rate": 7.861663785892857e-06, + "loss": 0.45035881996154786, + "memory(GiB)": 77.63, + "step": 1820, + "token_acc": 0.8806643202815662, + "train_speed(iter/s)": 0.029498 + }, + { + "epoch": 0.5839766409343626, + "grad_norm": 0.05451209338463787, + "learning_rate": 7.810966761934053e-06, + "loss": 0.44800753593444825, + "memory(GiB)": 77.63, + "step": 1825, + "token_acc": 0.8771571298819255, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.5855765769369226, + "grad_norm": 0.05912934985203241, + "learning_rate": 7.760328763570589e-06, + "loss": 0.4499057769775391, + "memory(GiB)": 77.63, + "step": 1830, + "token_acc": 0.868710326675956, + "train_speed(iter/s)": 0.029487 + }, + { + "epoch": 0.5871765129394825, + "grad_norm": 0.052841905445767515, + "learning_rate": 7.709751156217088e-06, + "loss": 0.4497323989868164, + "memory(GiB)": 77.63, + "step": 1835, + "token_acc": 0.8117094325984822, + "train_speed(iter/s)": 0.029501 + }, + { + "epoch": 0.5887764489420423, + "grad_norm": 0.060076953422732254, + "learning_rate": 7.659235303659784e-06, + "loss": 0.4582187652587891, + "memory(GiB)": 77.63, + "step": 1840, + "token_acc": 0.8795674258561363, + "train_speed(iter/s)": 0.029492 + }, + { + "epoch": 0.5903763849446022, + "grad_norm": 0.06307528499562465, + "learning_rate": 7.608782568019729e-06, + "loss": 0.4430552005767822, + "memory(GiB)": 77.63, + "step": 1845, + "token_acc": 0.8452444922084901, + "train_speed(iter/s)": 0.029498 + }, + { + "epoch": 0.5919763209471621, + "grad_norm": 0.05378691938628143, + "learning_rate": 7.558394309716088e-06, + "loss": 0.459810209274292, + "memory(GiB)": 77.63, + "step": 1850, + "token_acc": 0.8506092736192435, + "train_speed(iter/s)": 0.029503 + }, + { + "epoch": 0.593576256949722, + "grad_norm": 0.0586506530143339, + "learning_rate": 7.508071887429433e-06, + "loss": 0.46239190101623534, + "memory(GiB)": 77.63, + "step": 1855, + "token_acc": 0.9115304709141274, + "train_speed(iter/s)": 0.029495 + }, + { + "epoch": 0.5951761929522819, + "grad_norm": 0.053290473896441634, + "learning_rate": 7.4578166580651335e-06, + "loss": 0.4524221897125244, + "memory(GiB)": 77.63, + "step": 1860, + "token_acc": 0.8817879571481345, + "train_speed(iter/s)": 0.029508 + }, + { + "epoch": 0.5967761289548418, + "grad_norm": 0.051901913358510056, + "learning_rate": 7.4076299767167325e-06, + "loss": 0.4579325675964355, + "memory(GiB)": 77.63, + "step": 1865, + "token_acc": 0.8617401668653158, + "train_speed(iter/s)": 0.029506 + }, + { + "epoch": 0.5983760649574017, + "grad_norm": 0.05256077511072294, + "learning_rate": 7.35751319662945e-06, + "loss": 0.45406513214111327, + "memory(GiB)": 77.63, + "step": 1870, + "token_acc": 0.8924402944873406, + "train_speed(iter/s)": 0.029497 + }, + { + "epoch": 0.5999760009599616, + "grad_norm": 0.056121622843709036, + "learning_rate": 7.307467669163655e-06, + "loss": 0.450104284286499, + "memory(GiB)": 77.63, + "step": 1875, + "token_acc": 0.8646184340931615, + "train_speed(iter/s)": 0.02951 + }, + { + "epoch": 0.6015759369625215, + "grad_norm": 0.051068951060234354, + "learning_rate": 7.25749474375846e-06, + "loss": 0.45695791244506834, + "memory(GiB)": 77.63, + "step": 1880, + "token_acc": 0.9112655568126717, + "train_speed(iter/s)": 0.029503 + }, + { + "epoch": 0.6031758729650813, + "grad_norm": 0.05120698584703106, + "learning_rate": 7.207595767895303e-06, + "loss": 0.4460740089416504, + "memory(GiB)": 77.63, + "step": 1885, + "token_acc": 0.8637192342752963, + "train_speed(iter/s)": 0.029499 + }, + { + "epoch": 0.6047758089676413, + "grad_norm": 0.05826366701259215, + "learning_rate": 7.157772087061645e-06, + "loss": 0.4498391628265381, + "memory(GiB)": 77.63, + "step": 1890, + "token_acc": 0.8602477214302408, + "train_speed(iter/s)": 0.029509 + }, + { + "epoch": 0.6063757449702012, + "grad_norm": 0.05454678875604061, + "learning_rate": 7.108025044714661e-06, + "loss": 0.44768247604370115, + "memory(GiB)": 77.63, + "step": 1895, + "token_acc": 0.8998014357721094, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.6079756809727611, + "grad_norm": 0.04862560763785379, + "learning_rate": 7.058355982245038e-06, + "loss": 0.44283151626586914, + "memory(GiB)": 77.63, + "step": 1900, + "token_acc": 0.8749580958766343, + "train_speed(iter/s)": 0.029501 + }, + { + "epoch": 0.609575616975321, + "grad_norm": 0.05390239428952395, + "learning_rate": 7.00876623894079e-06, + "loss": 0.4445077419281006, + "memory(GiB)": 77.63, + "step": 1905, + "token_acc": 0.8588156123822341, + "train_speed(iter/s)": 0.029504 + }, + { + "epoch": 0.6111755529778808, + "grad_norm": 0.052917745372876655, + "learning_rate": 6.959257151951153e-06, + "loss": 0.45001955032348634, + "memory(GiB)": 77.63, + "step": 1910, + "token_acc": 0.8768155911013054, + "train_speed(iter/s)": 0.029494 + }, + { + "epoch": 0.6127754889804408, + "grad_norm": 0.05432256049056495, + "learning_rate": 6.909830056250527e-06, + "loss": 0.44944238662719727, + "memory(GiB)": 77.63, + "step": 1915, + "token_acc": 0.8941244909831297, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.6143754249830007, + "grad_norm": 0.05852297407331436, + "learning_rate": 6.860486284602479e-06, + "loss": 0.4477729797363281, + "memory(GiB)": 77.63, + "step": 1920, + "token_acc": 0.8854845719252499, + "train_speed(iter/s)": 0.029501 + }, + { + "epoch": 0.6159753609855606, + "grad_norm": 0.05474007805899836, + "learning_rate": 6.8112271675238154e-06, + "loss": 0.4501204013824463, + "memory(GiB)": 77.63, + "step": 1925, + "token_acc": 0.8803290949887809, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.6175752969881205, + "grad_norm": 0.05545012433641634, + "learning_rate": 6.762054033248681e-06, + "loss": 0.44565958976745607, + "memory(GiB)": 77.63, + "step": 1930, + "token_acc": 0.8480542195015304, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.6191752329906803, + "grad_norm": 0.05495247298953925, + "learning_rate": 6.712968207692778e-06, + "loss": 0.44170804023742677, + "memory(GiB)": 77.63, + "step": 1935, + "token_acc": 0.8709073900841908, + "train_speed(iter/s)": 0.029498 + }, + { + "epoch": 0.6207751689932403, + "grad_norm": 0.05792014047889592, + "learning_rate": 6.663971014417585e-06, + "loss": 0.4454016208648682, + "memory(GiB)": 77.63, + "step": 1940, + "token_acc": 0.8606651376146789, + "train_speed(iter/s)": 0.0295 + }, + { + "epoch": 0.6223751049958002, + "grad_norm": 0.04853659131630362, + "learning_rate": 6.615063774594677e-06, + "loss": 0.4387532711029053, + "memory(GiB)": 77.63, + "step": 1945, + "token_acc": 0.8920454545454546, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.6239750409983601, + "grad_norm": 0.05266495974136303, + "learning_rate": 6.566247806970119e-06, + "loss": 0.4472493171691895, + "memory(GiB)": 77.63, + "step": 1950, + "token_acc": 0.848505251817937, + "train_speed(iter/s)": 0.029497 + }, + { + "epoch": 0.62557497700092, + "grad_norm": 0.054994759694813, + "learning_rate": 6.5175244278288705e-06, + "loss": 0.44487895965576174, + "memory(GiB)": 77.63, + "step": 1955, + "token_acc": 0.8689320388349514, + "train_speed(iter/s)": 0.029506 + }, + { + "epoch": 0.6271749130034798, + "grad_norm": 0.057067387083368365, + "learning_rate": 6.468894950959336e-06, + "loss": 0.4466127395629883, + "memory(GiB)": 77.63, + "step": 1960, + "token_acc": 0.846737755286463, + "train_speed(iter/s)": 0.0295 + }, + { + "epoch": 0.6287748490060397, + "grad_norm": 0.05417940634204734, + "learning_rate": 6.420360687617897e-06, + "loss": 0.44883151054382325, + "memory(GiB)": 77.63, + "step": 1965, + "token_acc": 0.8795967892477132, + "train_speed(iter/s)": 0.02949 + }, + { + "epoch": 0.6303747850085997, + "grad_norm": 0.05176488752108695, + "learning_rate": 6.3719229464935915e-06, + "loss": 0.4542849063873291, + "memory(GiB)": 77.63, + "step": 1970, + "token_acc": 0.8568893191352049, + "train_speed(iter/s)": 0.029503 + }, + { + "epoch": 0.6319747210111596, + "grad_norm": 0.04867135924369273, + "learning_rate": 6.323583033672799e-06, + "loss": 0.44331774711608884, + "memory(GiB)": 77.63, + "step": 1975, + "token_acc": 0.8647865559204172, + "train_speed(iter/s)": 0.029491 + }, + { + "epoch": 0.6335746570137194, + "grad_norm": 0.06076783884358601, + "learning_rate": 6.275342252604044e-06, + "loss": 0.44751858711242676, + "memory(GiB)": 77.63, + "step": 1980, + "token_acc": 0.871765773944621, + "train_speed(iter/s)": 0.029486 + }, + { + "epoch": 0.6351745930162793, + "grad_norm": 0.0520886449098567, + "learning_rate": 6.22720190406283e-06, + "loss": 0.46150927543640136, + "memory(GiB)": 77.63, + "step": 1985, + "token_acc": 0.8921661480178595, + "train_speed(iter/s)": 0.029497 + }, + { + "epoch": 0.6367745290188392, + "grad_norm": 0.058090405193780774, + "learning_rate": 6.179163286116581e-06, + "loss": 0.44019436836242676, + "memory(GiB)": 77.63, + "step": 1990, + "token_acc": 0.9157033805888768, + "train_speed(iter/s)": 0.029488 + }, + { + "epoch": 0.6383744650213992, + "grad_norm": 0.057472120727550105, + "learning_rate": 6.13122769408963e-06, + "loss": 0.4466409683227539, + "memory(GiB)": 77.63, + "step": 1995, + "token_acc": 0.8608313968499871, + "train_speed(iter/s)": 0.029492 + }, + { + "epoch": 0.6399744010239591, + "grad_norm": 0.05665485079826101, + "learning_rate": 6.083396420528298e-06, + "loss": 0.451153039932251, + "memory(GiB)": 77.63, + "step": 2000, + "token_acc": 0.8910367046369808, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.6399744010239591, + "eval_loss": 0.6622327566146851, + "eval_runtime": 115.9166, + "eval_samples_per_second": 173.297, + "eval_steps_per_second": 0.871, + "eval_token_acc": 0.8709580958089251, + "step": 2000 + }, + { + "epoch": 0.6415743370265189, + "grad_norm": 0.05247711719910443, + "learning_rate": 6.0356707551660434e-06, + "loss": 0.45055346488952636, + "memory(GiB)": 77.63, + "step": 2005, + "token_acc": 0.8961562482257424, + "train_speed(iter/s)": 0.029453 + }, + { + "epoch": 0.6431742730290788, + "grad_norm": 0.0533769803289562, + "learning_rate": 5.988051984888668e-06, + "loss": 0.4436792373657227, + "memory(GiB)": 77.63, + "step": 2010, + "token_acc": 0.8894836272040302, + "train_speed(iter/s)": 0.029463 + }, + { + "epoch": 0.6447742090316387, + "grad_norm": 0.05428299581856707, + "learning_rate": 5.940541393699646e-06, + "loss": 0.44562363624572754, + "memory(GiB)": 77.63, + "step": 2015, + "token_acc": 0.8804031789106416, + "train_speed(iter/s)": 0.029477 + }, + { + "epoch": 0.6463741450341987, + "grad_norm": 0.055930584071511934, + "learning_rate": 5.893140262685469e-06, + "loss": 0.4412201404571533, + "memory(GiB)": 77.63, + "step": 2020, + "token_acc": 0.8791348600508906, + "train_speed(iter/s)": 0.029482 + }, + { + "epoch": 0.6479740810367586, + "grad_norm": 0.06077731970466293, + "learning_rate": 5.845849869981137e-06, + "loss": 0.44964237213134767, + "memory(GiB)": 77.63, + "step": 2025, + "token_acc": 0.8710450018908358, + "train_speed(iter/s)": 0.029484 + }, + { + "epoch": 0.6495740170393184, + "grad_norm": 0.05824848510516177, + "learning_rate": 5.7986714907356614e-06, + "loss": 0.4586543083190918, + "memory(GiB)": 77.63, + "step": 2030, + "token_acc": 0.8852591792656588, + "train_speed(iter/s)": 0.029498 + }, + { + "epoch": 0.6511739530418783, + "grad_norm": 0.06066761562869553, + "learning_rate": 5.751606397077703e-06, + "loss": 0.44632205963134763, + "memory(GiB)": 77.63, + "step": 2035, + "token_acc": 0.8871352785145888, + "train_speed(iter/s)": 0.029494 + }, + { + "epoch": 0.6527738890444382, + "grad_norm": 0.055201144436432543, + "learning_rate": 5.704655858081268e-06, + "loss": 0.43164916038513185, + "memory(GiB)": 77.63, + "step": 2040, + "token_acc": 0.8937977909940527, + "train_speed(iter/s)": 0.029496 + }, + { + "epoch": 0.6543738250469981, + "grad_norm": 0.05987076771844116, + "learning_rate": 5.6578211397314765e-06, + "loss": 0.4560856819152832, + "memory(GiB)": 77.63, + "step": 2045, + "token_acc": 0.8462914545204349, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.6559737610495581, + "grad_norm": 0.05549604363093839, + "learning_rate": 5.611103504890444e-06, + "loss": 0.44809746742248535, + "memory(GiB)": 77.63, + "step": 2050, + "token_acc": 0.8783254279232832, + "train_speed(iter/s)": 0.029503 + }, + { + "epoch": 0.6575736970521179, + "grad_norm": 0.060605872447174955, + "learning_rate": 5.564504213263205e-06, + "loss": 0.43492536544799804, + "memory(GiB)": 77.63, + "step": 2055, + "token_acc": 0.8383036405886909, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.6591736330546778, + "grad_norm": 0.05003885513998493, + "learning_rate": 5.5180245213637785e-06, + "loss": 0.44741315841674806, + "memory(GiB)": 77.63, + "step": 2060, + "token_acc": 0.8784857874174862, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.6607735690572377, + "grad_norm": 0.053248832099036005, + "learning_rate": 5.4716656824812505e-06, + "loss": 0.4469279766082764, + "memory(GiB)": 77.63, + "step": 2065, + "token_acc": 0.8853107344632768, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.6623735050597976, + "grad_norm": 0.050937098304756526, + "learning_rate": 5.425428946646016e-06, + "loss": 0.44948582649230956, + "memory(GiB)": 77.63, + "step": 2070, + "token_acc": 0.8934210526315789, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.6639734410623576, + "grad_norm": 0.050018563981163396, + "learning_rate": 5.379315560596038e-06, + "loss": 0.4475410461425781, + "memory(GiB)": 77.63, + "step": 2075, + "token_acc": 0.8478816857555876, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.6655733770649174, + "grad_norm": 0.05745892430696422, + "learning_rate": 5.333326767743263e-06, + "loss": 0.45008225440979005, + "memory(GiB)": 77.63, + "step": 2080, + "token_acc": 0.8264099454214675, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.6671733130674773, + "grad_norm": 0.05536507134607956, + "learning_rate": 5.287463808140069e-06, + "loss": 0.4393789291381836, + "memory(GiB)": 77.63, + "step": 2085, + "token_acc": 0.8450008816787162, + "train_speed(iter/s)": 0.029527 + }, + { + "epoch": 0.6687732490700372, + "grad_norm": 0.06142641017026178, + "learning_rate": 5.241727918445836e-06, + "loss": 0.4437687873840332, + "memory(GiB)": 77.63, + "step": 2090, + "token_acc": 0.8837277242185217, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.6703731850725971, + "grad_norm": 0.05426196603270913, + "learning_rate": 5.1961203318936116e-06, + "loss": 0.4427367687225342, + "memory(GiB)": 77.63, + "step": 2095, + "token_acc": 0.856048805815161, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.6719731210751569, + "grad_norm": 0.054169398190345976, + "learning_rate": 5.1506422782568345e-06, + "loss": 0.4520686626434326, + "memory(GiB)": 77.63, + "step": 2100, + "token_acc": 0.8747133027522935, + "train_speed(iter/s)": 0.029529 + }, + { + "epoch": 0.6735730570777169, + "grad_norm": 0.054436537230257924, + "learning_rate": 5.105294983816203e-06, + "loss": 0.44482645988464353, + "memory(GiB)": 77.63, + "step": 2105, + "token_acc": 0.8637377049180328, + "train_speed(iter/s)": 0.029517 + }, + { + "epoch": 0.6751729930802768, + "grad_norm": 0.05860088154390529, + "learning_rate": 5.060079671326577e-06, + "loss": 0.44719686508178713, + "memory(GiB)": 77.63, + "step": 2110, + "token_acc": 0.8593150866058442, + "train_speed(iter/s)": 0.029529 + }, + { + "epoch": 0.6767729290828367, + "grad_norm": 0.05264024149284518, + "learning_rate": 5.014997559984045e-06, + "loss": 0.43972039222717285, + "memory(GiB)": 77.63, + "step": 2115, + "token_acc": 0.8533221194280909, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.6783728650853966, + "grad_norm": 0.0534652970629265, + "learning_rate": 4.970049865393009e-06, + "loss": 0.4468375205993652, + "memory(GiB)": 77.63, + "step": 2120, + "token_acc": 0.8628782287822878, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.6799728010879564, + "grad_norm": 0.05246927821047006, + "learning_rate": 4.925237799533445e-06, + "loss": 0.4498266696929932, + "memory(GiB)": 77.63, + "step": 2125, + "token_acc": 0.9048205760049284, + "train_speed(iter/s)": 0.029533 + }, + { + "epoch": 0.6815727370905164, + "grad_norm": 0.05104237083350841, + "learning_rate": 4.880562570728188e-06, + "loss": 0.4389338970184326, + "memory(GiB)": 77.63, + "step": 2130, + "token_acc": 0.8844444444444445, + "train_speed(iter/s)": 0.029525 + }, + { + "epoch": 0.6831726730930763, + "grad_norm": 0.05297787940328326, + "learning_rate": 4.836025383610382e-06, + "loss": 0.4495584487915039, + "memory(GiB)": 77.63, + "step": 2135, + "token_acc": 0.8647426233038984, + "train_speed(iter/s)": 0.029524 + }, + { + "epoch": 0.6847726090956362, + "grad_norm": 0.05092547333787791, + "learning_rate": 4.791627439090975e-06, + "loss": 0.4421692848205566, + "memory(GiB)": 77.63, + "step": 2140, + "token_acc": 0.8828041384231181, + "train_speed(iter/s)": 0.029534 + }, + { + "epoch": 0.6863725450981961, + "grad_norm": 0.053418572817851825, + "learning_rate": 4.74736993432634e-06, + "loss": 0.44208922386169436, + "memory(GiB)": 77.63, + "step": 2145, + "token_acc": 0.8888520238885202, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.6879724811007559, + "grad_norm": 0.053760421496406786, + "learning_rate": 4.703254062686017e-06, + "loss": 0.4469425201416016, + "memory(GiB)": 77.63, + "step": 2150, + "token_acc": 0.8764145324597975, + "train_speed(iter/s)": 0.02953 + }, + { + "epoch": 0.6895724171033158, + "grad_norm": 0.05280613332203591, + "learning_rate": 4.6592810137205e-06, + "loss": 0.45023741722106936, + "memory(GiB)": 77.63, + "step": 2155, + "token_acc": 0.8968010517090271, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.6911723531058758, + "grad_norm": 0.05438679052798784, + "learning_rate": 4.615451973129196e-06, + "loss": 0.4470167636871338, + "memory(GiB)": 77.63, + "step": 2160, + "token_acc": 0.8761696818465378, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.6927722891084357, + "grad_norm": 0.056426544102266905, + "learning_rate": 4.571768122728421e-06, + "loss": 0.4486443042755127, + "memory(GiB)": 77.63, + "step": 2165, + "token_acc": 0.8781996587030717, + "train_speed(iter/s)": 0.029533 + }, + { + "epoch": 0.6943722251109956, + "grad_norm": 0.05337656902490804, + "learning_rate": 4.528230640419562e-06, + "loss": 0.4497722625732422, + "memory(GiB)": 77.63, + "step": 2170, + "token_acc": 0.8571428571428571, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.6959721611135554, + "grad_norm": 0.054129658100736014, + "learning_rate": 4.4848407001572945e-06, + "loss": 0.44121665954589845, + "memory(GiB)": 77.63, + "step": 2175, + "token_acc": 0.8674278464954012, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.6975720971161153, + "grad_norm": 0.05332136363084243, + "learning_rate": 4.441599471917946e-06, + "loss": 0.43872866630554197, + "memory(GiB)": 77.63, + "step": 2180, + "token_acc": 0.8575067664384652, + "train_speed(iter/s)": 0.029529 + }, + { + "epoch": 0.6991720331186753, + "grad_norm": 0.06093731322456081, + "learning_rate": 4.398508121667925e-06, + "loss": 0.42902402877807616, + "memory(GiB)": 77.63, + "step": 2185, + "token_acc": 0.8526187576126675, + "train_speed(iter/s)": 0.029519 + }, + { + "epoch": 0.7007719691212352, + "grad_norm": 0.05959591977220614, + "learning_rate": 4.355567811332311e-06, + "loss": 0.44504075050354003, + "memory(GiB)": 77.63, + "step": 2190, + "token_acc": 0.8604511878618487, + "train_speed(iter/s)": 0.029519 + }, + { + "epoch": 0.702371905123795, + "grad_norm": 0.052994813328955795, + "learning_rate": 4.312779698763493e-06, + "loss": 0.4408130168914795, + "memory(GiB)": 77.63, + "step": 2195, + "token_acc": 0.8787728847105394, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.7039718411263549, + "grad_norm": 0.05402320661185779, + "learning_rate": 4.270144937709981e-06, + "loss": 0.4396658897399902, + "memory(GiB)": 77.63, + "step": 2200, + "token_acc": 0.8677248677248677, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.7055717771289148, + "grad_norm": 0.053270864469091045, + "learning_rate": 4.227664677785264e-06, + "loss": 0.4493250846862793, + "memory(GiB)": 77.63, + "step": 2205, + "token_acc": 0.8801270542742715, + "train_speed(iter/s)": 0.029523 + }, + { + "epoch": 0.7071717131314748, + "grad_norm": 0.06248819511930574, + "learning_rate": 4.1853400644368395e-06, + "loss": 0.44740095138549807, + "memory(GiB)": 77.63, + "step": 2210, + "token_acc": 0.8988747408942849, + "train_speed(iter/s)": 0.029514 + }, + { + "epoch": 0.7087716491340347, + "grad_norm": 0.05807018681919018, + "learning_rate": 4.143172238915302e-06, + "loss": 0.4508991241455078, + "memory(GiB)": 77.63, + "step": 2215, + "token_acc": 0.8739803562510404, + "train_speed(iter/s)": 0.029506 + }, + { + "epoch": 0.7103715851365945, + "grad_norm": 0.05376168037907147, + "learning_rate": 4.101162338243595e-06, + "loss": 0.4486696243286133, + "memory(GiB)": 77.63, + "step": 2220, + "token_acc": 0.8750778169744761, + "train_speed(iter/s)": 0.029514 + }, + { + "epoch": 0.7119715211391544, + "grad_norm": 0.05460944158847028, + "learning_rate": 4.059311495186338e-06, + "loss": 0.4484865188598633, + "memory(GiB)": 77.63, + "step": 2225, + "token_acc": 0.8524350054924936, + "train_speed(iter/s)": 0.029506 + }, + { + "epoch": 0.7135714571417143, + "grad_norm": 0.05716955035585288, + "learning_rate": 4.017620838219276e-06, + "loss": 0.44258599281311034, + "memory(GiB)": 77.63, + "step": 2230, + "token_acc": 0.8597191629955947, + "train_speed(iter/s)": 0.029504 + }, + { + "epoch": 0.7151713931442742, + "grad_norm": 0.05984913995816041, + "learning_rate": 3.9760914914988716e-06, + "loss": 0.4547589778900146, + "memory(GiB)": 77.63, + "step": 2235, + "token_acc": 0.8679617117117117, + "train_speed(iter/s)": 0.029511 + }, + { + "epoch": 0.7167713291468342, + "grad_norm": 0.05686589162715874, + "learning_rate": 3.93472457483197e-06, + "loss": 0.4416301727294922, + "memory(GiB)": 77.63, + "step": 2240, + "token_acc": 0.826577064816822, + "train_speed(iter/s)": 0.029498 + }, + { + "epoch": 0.718371265149394, + "grad_norm": 0.05780707586931182, + "learning_rate": 3.893521203645618e-06, + "loss": 0.45052361488342285, + "memory(GiB)": 77.63, + "step": 2245, + "token_acc": 0.8836182062608028, + "train_speed(iter/s)": 0.0295 + }, + { + "epoch": 0.7199712011519539, + "grad_norm": 0.049110615928360885, + "learning_rate": 3.852482488956992e-06, + "loss": 0.4427218437194824, + "memory(GiB)": 77.63, + "step": 2250, + "token_acc": 0.8621255642183012, + "train_speed(iter/s)": 0.029501 + }, + { + "epoch": 0.7199712011519539, + "eval_loss": 0.659950852394104, + "eval_runtime": 108.3142, + "eval_samples_per_second": 185.461, + "eval_steps_per_second": 0.932, + "eval_token_acc": 0.8716289458342705, + "step": 2250 + }, + { + "epoch": 0.7215711371545138, + "grad_norm": 0.04899882607919235, + "learning_rate": 3.8116095373434204e-06, + "loss": 0.4487879753112793, + "memory(GiB)": 77.63, + "step": 2255, + "token_acc": 0.8912671818368324, + "train_speed(iter/s)": 0.029473 + }, + { + "epoch": 0.7231710731570737, + "grad_norm": 0.05080548488435112, + "learning_rate": 3.7709034509125706e-06, + "loss": 0.44452829360961915, + "memory(GiB)": 77.63, + "step": 2260, + "token_acc": 0.8442477876106195, + "train_speed(iter/s)": 0.029482 + }, + { + "epoch": 0.7247710091596337, + "grad_norm": 0.048986009146357284, + "learning_rate": 3.7303653272727057e-06, + "loss": 0.4472095012664795, + "memory(GiB)": 77.63, + "step": 2265, + "token_acc": 0.870567815521944, + "train_speed(iter/s)": 0.029495 + }, + { + "epoch": 0.7263709451621935, + "grad_norm": 0.05152412916361422, + "learning_rate": 3.689996259503116e-06, + "loss": 0.440493106842041, + "memory(GiB)": 77.63, + "step": 2270, + "token_acc": 0.8795436455293181, + "train_speed(iter/s)": 0.029499 + }, + { + "epoch": 0.7279708811647534, + "grad_norm": 0.055480142184644934, + "learning_rate": 3.6497973361246153e-06, + "loss": 0.4417555809020996, + "memory(GiB)": 77.63, + "step": 2275, + "token_acc": 0.8660460713158725, + "train_speed(iter/s)": 0.029499 + }, + { + "epoch": 0.7295708171673133, + "grad_norm": 0.05625540509082736, + "learning_rate": 3.609769641070221e-06, + "loss": 0.4407214164733887, + "memory(GiB)": 77.63, + "step": 2280, + "token_acc": 0.8890608875128999, + "train_speed(iter/s)": 0.029513 + }, + { + "epoch": 0.7311707531698732, + "grad_norm": 0.05002588428206622, + "learning_rate": 3.569914253655896e-06, + "loss": 0.4413386344909668, + "memory(GiB)": 77.63, + "step": 2285, + "token_acc": 0.8921049390319005, + "train_speed(iter/s)": 0.029513 + }, + { + "epoch": 0.7327706891724332, + "grad_norm": 0.05088814815973685, + "learning_rate": 3.530232248551466e-06, + "loss": 0.4507819652557373, + "memory(GiB)": 77.63, + "step": 2290, + "token_acc": 0.8278411830895355, + "train_speed(iter/s)": 0.029513 + }, + { + "epoch": 0.734370625174993, + "grad_norm": 0.05399937134620822, + "learning_rate": 3.4907246957516416e-06, + "loss": 0.4447961330413818, + "memory(GiB)": 77.63, + "step": 2295, + "token_acc": 0.8888263967004124, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.7359705611775529, + "grad_norm": 0.06200035405309708, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.45868444442749023, + "memory(GiB)": 77.63, + "step": 2300, + "token_acc": 0.8513141426783479, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.7375704971801128, + "grad_norm": 0.055876255912378235, + "learning_rate": 3.412237203496036e-06, + "loss": 0.4431456089019775, + "memory(GiB)": 77.63, + "step": 2305, + "token_acc": 0.8651997041420119, + "train_speed(iter/s)": 0.029532 + }, + { + "epoch": 0.7391704331826727, + "grad_norm": 0.06032844036632358, + "learning_rate": 3.3732593803950354e-06, + "loss": 0.4452229976654053, + "memory(GiB)": 77.63, + "step": 2310, + "token_acc": 0.8915232899706252, + "train_speed(iter/s)": 0.02953 + }, + { + "epoch": 0.7407703691852325, + "grad_norm": 0.05255216039270682, + "learning_rate": 3.3344602422511343e-06, + "loss": 0.4414207458496094, + "memory(GiB)": 77.63, + "step": 2315, + "token_acc": 0.8901802257032171, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.7423703051877925, + "grad_norm": 0.05535966142690852, + "learning_rate": 3.2958408352532055e-06, + "loss": 0.43938393592834474, + "memory(GiB)": 77.63, + "step": 2320, + "token_acc": 0.8354077253218885, + "train_speed(iter/s)": 0.029536 + }, + { + "epoch": 0.7439702411903524, + "grad_norm": 0.0505418855319798, + "learning_rate": 3.257402200743821e-06, + "loss": 0.44445362091064455, + "memory(GiB)": 77.63, + "step": 2325, + "token_acc": 0.864262790258637, + "train_speed(iter/s)": 0.02953 + }, + { + "epoch": 0.7455701771929123, + "grad_norm": 0.0563107101835597, + "learning_rate": 3.2191453751911505e-06, + "loss": 0.45569453239440916, + "memory(GiB)": 77.63, + "step": 2330, + "token_acc": 0.8784313725490196, + "train_speed(iter/s)": 0.029523 + }, + { + "epoch": 0.7471701131954722, + "grad_norm": 0.05000152007266613, + "learning_rate": 3.1810713901610367e-06, + "loss": 0.4395348072052002, + "memory(GiB)": 77.63, + "step": 2335, + "token_acc": 0.8867111781175964, + "train_speed(iter/s)": 0.029536 + }, + { + "epoch": 0.748770049198032, + "grad_norm": 0.057169590375126145, + "learning_rate": 3.1431812722891598e-06, + "loss": 0.4397278785705566, + "memory(GiB)": 77.63, + "step": 2340, + "token_acc": 0.8577532891037895, + "train_speed(iter/s)": 0.029529 + }, + { + "epoch": 0.750369985200592, + "grad_norm": 0.05730745865195846, + "learning_rate": 3.1054760432533626e-06, + "loss": 0.45998029708862304, + "memory(GiB)": 77.63, + "step": 2345, + "token_acc": 0.8845755097339016, + "train_speed(iter/s)": 0.029534 + }, + { + "epoch": 0.7519699212031519, + "grad_norm": 0.05180470840824953, + "learning_rate": 3.0679567197461135e-06, + "loss": 0.45008273124694825, + "memory(GiB)": 77.63, + "step": 2350, + "token_acc": 0.8394425931535898, + "train_speed(iter/s)": 0.029537 + }, + { + "epoch": 0.7535698572057118, + "grad_norm": 0.06025883780673481, + "learning_rate": 3.0306243134470668e-06, + "loss": 0.4444745540618896, + "memory(GiB)": 77.63, + "step": 2355, + "token_acc": 0.889631386074585, + "train_speed(iter/s)": 0.02953 + }, + { + "epoch": 0.7551697932082717, + "grad_norm": 0.05199872680450009, + "learning_rate": 2.993479830995815e-06, + "loss": 0.451768159866333, + "memory(GiB)": 77.63, + "step": 2360, + "token_acc": 0.8736520199581522, + "train_speed(iter/s)": 0.029541 + }, + { + "epoch": 0.7567697292108315, + "grad_norm": 0.05489027404588469, + "learning_rate": 2.9565242739647115e-06, + "loss": 0.4442115306854248, + "memory(GiB)": 77.63, + "step": 2365, + "token_acc": 0.8865552903739061, + "train_speed(iter/s)": 0.029538 + }, + { + "epoch": 0.7583696652133914, + "grad_norm": 0.06334021131539457, + "learning_rate": 2.919758638831893e-06, + "loss": 0.4570741653442383, + "memory(GiB)": 77.63, + "step": 2370, + "token_acc": 0.8652606912712361, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.7599696012159514, + "grad_norm": 0.053831666314624105, + "learning_rate": 2.8831839169543998e-06, + "loss": 0.44495415687561035, + "memory(GiB)": 77.63, + "step": 2375, + "token_acc": 0.8756476683937824, + "train_speed(iter/s)": 0.029541 + }, + { + "epoch": 0.7615695372185113, + "grad_norm": 0.0527583973457582, + "learning_rate": 2.84680109454143e-06, + "loss": 0.4472104549407959, + "memory(GiB)": 77.63, + "step": 2380, + "token_acc": 0.8725328947368421, + "train_speed(iter/s)": 0.029536 + }, + { + "epoch": 0.7631694732210712, + "grad_norm": 0.058941021305098804, + "learning_rate": 2.810611152627777e-06, + "loss": 0.4499720573425293, + "memory(GiB)": 77.63, + "step": 2385, + "token_acc": 0.8632213889794588, + "train_speed(iter/s)": 0.029535 + }, + { + "epoch": 0.764769409223631, + "grad_norm": 0.05393008170855123, + "learning_rate": 2.774615067047346e-06, + "loss": 0.43872222900390623, + "memory(GiB)": 77.63, + "step": 2390, + "token_acc": 0.8742202234150588, + "train_speed(iter/s)": 0.02954 + }, + { + "epoch": 0.7663693452261909, + "grad_norm": 0.0556417645809335, + "learning_rate": 2.738813808406866e-06, + "loss": 0.4399220943450928, + "memory(GiB)": 77.63, + "step": 2395, + "token_acc": 0.8997599039615847, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.7679692812287509, + "grad_norm": 0.05379432234404155, + "learning_rate": 2.7032083420597e-06, + "loss": 0.4382453441619873, + "memory(GiB)": 77.63, + "step": 2400, + "token_acc": 0.8875784668061633, + "train_speed(iter/s)": 0.029541 + }, + { + "epoch": 0.7695692172313108, + "grad_norm": 0.05806842163630925, + "learning_rate": 2.667799628079829e-06, + "loss": 0.44454326629638674, + "memory(GiB)": 77.63, + "step": 2405, + "token_acc": 0.8880662020905923, + "train_speed(iter/s)": 0.029535 + }, + { + "epoch": 0.7711691532338707, + "grad_norm": 0.06148704112133217, + "learning_rate": 2.6325886212359496e-06, + "loss": 0.43945813179016113, + "memory(GiB)": 77.63, + "step": 2410, + "token_acc": 0.8767772511848341, + "train_speed(iter/s)": 0.029527 + }, + { + "epoch": 0.7727690892364305, + "grad_norm": 0.056530065759685846, + "learning_rate": 2.5975762709657506e-06, + "loss": 0.4438450813293457, + "memory(GiB)": 77.63, + "step": 2415, + "token_acc": 0.8570975416336241, + "train_speed(iter/s)": 0.029538 + }, + { + "epoch": 0.7743690252389904, + "grad_norm": 0.056107845444701834, + "learning_rate": 2.5627635213502832e-06, + "loss": 0.43836054801940916, + "memory(GiB)": 77.63, + "step": 2420, + "token_acc": 0.8966822253059165, + "train_speed(iter/s)": 0.029532 + }, + { + "epoch": 0.7759689612415503, + "grad_norm": 0.05796065696017405, + "learning_rate": 2.528151311088537e-06, + "loss": 0.4400279998779297, + "memory(GiB)": 77.63, + "step": 2425, + "token_acc": 0.8552805280528053, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.7775688972441103, + "grad_norm": 0.05418546630146028, + "learning_rate": 2.4937405734720964e-06, + "loss": 0.44541444778442385, + "memory(GiB)": 77.63, + "step": 2430, + "token_acc": 0.8620764552562988, + "train_speed(iter/s)": 0.029532 + }, + { + "epoch": 0.7791688332466701, + "grad_norm": 0.05654250831277805, + "learning_rate": 2.459532236360007e-06, + "loss": 0.43491110801696775, + "memory(GiB)": 77.63, + "step": 2435, + "token_acc": 0.8570184983677911, + "train_speed(iter/s)": 0.029522 + }, + { + "epoch": 0.78076876924923, + "grad_norm": 0.05526372242621089, + "learning_rate": 2.4255272221537295e-06, + "loss": 0.4378859043121338, + "memory(GiB)": 77.63, + "step": 2440, + "token_acc": 0.8631236857197476, + "train_speed(iter/s)": 0.029526 + }, + { + "epoch": 0.7823687052517899, + "grad_norm": 0.05404315424969483, + "learning_rate": 2.391726447772279e-06, + "loss": 0.45857391357421873, + "memory(GiB)": 77.63, + "step": 2445, + "token_acc": 0.8634816932081122, + "train_speed(iter/s)": 0.029527 + }, + { + "epoch": 0.7839686412543498, + "grad_norm": 0.05765113554061621, + "learning_rate": 2.3581308246275103e-06, + "loss": 0.4473139762878418, + "memory(GiB)": 77.63, + "step": 2450, + "token_acc": 0.8979846898922044, + "train_speed(iter/s)": 0.029518 + }, + { + "epoch": 0.7855685772569098, + "grad_norm": 0.058353221842389495, + "learning_rate": 2.324741258599521e-06, + "loss": 0.44444866180419923, + "memory(GiB)": 77.63, + "step": 2455, + "token_acc": 0.8648913576213038, + "train_speed(iter/s)": 0.029527 + }, + { + "epoch": 0.7871685132594696, + "grad_norm": 0.05309289512529438, + "learning_rate": 2.29155865001225e-06, + "loss": 0.43857607841491697, + "memory(GiB)": 77.63, + "step": 2460, + "token_acc": 0.894580549368968, + "train_speed(iter/s)": 0.029521 + }, + { + "epoch": 0.7887684492620295, + "grad_norm": 0.0540412312893473, + "learning_rate": 2.2585838936091753e-06, + "loss": 0.43953213691711424, + "memory(GiB)": 77.63, + "step": 2465, + "token_acc": 0.8868672731513879, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.7903683852645894, + "grad_norm": 0.05880520800742855, + "learning_rate": 2.225817878529214e-06, + "loss": 0.4457580089569092, + "memory(GiB)": 77.63, + "step": 2470, + "token_acc": 0.8630282437884901, + "train_speed(iter/s)": 0.029525 + }, + { + "epoch": 0.7919683212671493, + "grad_norm": 0.058328885138964066, + "learning_rate": 2.1932614882827196e-06, + "loss": 0.4424918174743652, + "memory(GiB)": 77.63, + "step": 2475, + "token_acc": 0.8814697747925722, + "train_speed(iter/s)": 0.029517 + }, + { + "epoch": 0.7935682572697093, + "grad_norm": 0.05685263085809571, + "learning_rate": 2.160915600727688e-06, + "loss": 0.43921732902526855, + "memory(GiB)": 77.63, + "step": 2480, + "token_acc": 0.913681738109219, + "train_speed(iter/s)": 0.029516 + }, + { + "epoch": 0.7951681932722691, + "grad_norm": 0.056639845561812056, + "learning_rate": 2.1287810880460636e-06, + "loss": 0.44060502052307127, + "memory(GiB)": 77.63, + "step": 2485, + "token_acc": 0.8829075425790754, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.796768129274829, + "grad_norm": 0.05230960490349676, + "learning_rate": 2.0968588167202265e-06, + "loss": 0.43935480117797854, + "memory(GiB)": 77.63, + "step": 2490, + "token_acc": 0.8856997455470738, + "train_speed(iter/s)": 0.029511 + }, + { + "epoch": 0.7983680652773889, + "grad_norm": 0.05305183045142263, + "learning_rate": 2.0651496475096455e-06, + "loss": 0.4360368728637695, + "memory(GiB)": 77.63, + "step": 2495, + "token_acc": 0.8394655704008221, + "train_speed(iter/s)": 0.029517 + }, + { + "epoch": 0.7999680012799488, + "grad_norm": 0.05620012484228566, + "learning_rate": 2.03365443542764e-06, + "loss": 0.44507203102111814, + "memory(GiB)": 77.63, + "step": 2500, + "token_acc": 0.8857098429482195, + "train_speed(iter/s)": 0.029515 + }, + { + "epoch": 0.7999680012799488, + "eval_loss": 0.6586322784423828, + "eval_runtime": 105.1966, + "eval_samples_per_second": 190.957, + "eval_steps_per_second": 0.96, + "eval_token_acc": 0.8721292963419328, + "step": 2500 + }, + { + "epoch": 0.8015679372825087, + "grad_norm": 0.05541548293022976, + "learning_rate": 2.0023740297183536e-06, + "loss": 0.44654192924499514, + "memory(GiB)": 77.63, + "step": 2505, + "token_acc": 0.8819252077562327, + "train_speed(iter/s)": 0.029489 + }, + { + "epoch": 0.8031678732850686, + "grad_norm": 0.057374199419424524, + "learning_rate": 1.971309273833828e-06, + "loss": 0.44596128463745116, + "memory(GiB)": 77.63, + "step": 2510, + "token_acc": 0.855553561815898, + "train_speed(iter/s)": 0.029494 + }, + { + "epoch": 0.8047678092876285, + "grad_norm": 0.05297595418967434, + "learning_rate": 1.940461005411288e-06, + "loss": 0.45099148750305174, + "memory(GiB)": 77.63, + "step": 2515, + "token_acc": 0.8958361962347121, + "train_speed(iter/s)": 0.029503 + }, + { + "epoch": 0.8063677452901884, + "grad_norm": 0.056714681257095015, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.4423669338226318, + "memory(GiB)": 77.63, + "step": 2520, + "token_acc": 0.879045996592845, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.8079676812927483, + "grad_norm": 0.05868047347236466, + "learning_rate": 1.8794172522915022e-06, + "loss": 0.4462554931640625, + "memory(GiB)": 77.63, + "step": 2525, + "token_acc": 0.8811320754716981, + "train_speed(iter/s)": 0.029507 + }, + { + "epoch": 0.8095676172953082, + "grad_norm": 0.04860202133053876, + "learning_rate": 1.849223413592046e-06, + "loss": 0.4488513946533203, + "memory(GiB)": 77.63, + "step": 2530, + "token_acc": 0.8654490616621984, + "train_speed(iter/s)": 0.029517 + }, + { + "epoch": 0.811167553297868, + "grad_norm": 0.055005201314914515, + "learning_rate": 1.8192493543057676e-06, + "loss": 0.45094904899597166, + "memory(GiB)": 77.63, + "step": 2535, + "token_acc": 0.8800874078120732, + "train_speed(iter/s)": 0.029516 + }, + { + "epoch": 0.812767489300428, + "grad_norm": 0.058350813115584405, + "learning_rate": 1.7894958826600884e-06, + "loss": 0.4489152908325195, + "memory(GiB)": 77.63, + "step": 2540, + "token_acc": 0.8784655623365301, + "train_speed(iter/s)": 0.02952 + }, + { + "epoch": 0.8143674253029879, + "grad_norm": 0.05243037926696882, + "learning_rate": 1.7599638009344566e-06, + "loss": 0.4506648063659668, + "memory(GiB)": 77.63, + "step": 2545, + "token_acc": 0.8711162255466053, + "train_speed(iter/s)": 0.029531 + }, + { + "epoch": 0.8159673613055478, + "grad_norm": 0.055874525296942985, + "learning_rate": 1.730653905438714e-06, + "loss": 0.451121187210083, + "memory(GiB)": 77.63, + "step": 2550, + "token_acc": 0.875531914893617, + "train_speed(iter/s)": 0.029527 + }, + { + "epoch": 0.8175672973081076, + "grad_norm": 0.052382476873803714, + "learning_rate": 1.701566986491614e-06, + "loss": 0.43659415245056155, + "memory(GiB)": 77.63, + "step": 2555, + "token_acc": 0.8824301518844928, + "train_speed(iter/s)": 0.029539 + }, + { + "epoch": 0.8191672333106675, + "grad_norm": 0.05679998544269888, + "learning_rate": 1.672703828399529e-06, + "loss": 0.44143290519714357, + "memory(GiB)": 77.63, + "step": 2560, + "token_acc": 0.9194786645241921, + "train_speed(iter/s)": 0.029538 + }, + { + "epoch": 0.8207671693132275, + "grad_norm": 0.05177151194489443, + "learning_rate": 1.6440652094352838e-06, + "loss": 0.44036478996276857, + "memory(GiB)": 77.63, + "step": 2565, + "token_acc": 0.86801315171442, + "train_speed(iter/s)": 0.029534 + }, + { + "epoch": 0.8223671053157874, + "grad_norm": 0.047985617439266506, + "learning_rate": 1.6156519018171856e-06, + "loss": 0.44090909957885743, + "memory(GiB)": 77.63, + "step": 2570, + "token_acc": 0.8926761055759482, + "train_speed(iter/s)": 0.029546 + }, + { + "epoch": 0.8239670413183473, + "grad_norm": 0.06022869670658613, + "learning_rate": 1.587464671688187e-06, + "loss": 0.4480876922607422, + "memory(GiB)": 77.63, + "step": 2575, + "token_acc": 0.8749468913751594, + "train_speed(iter/s)": 0.029542 + }, + { + "epoch": 0.8255669773209071, + "grad_norm": 0.05591525813204934, + "learning_rate": 1.5595042790952442e-06, + "loss": 0.4516183853149414, + "memory(GiB)": 77.63, + "step": 2580, + "token_acc": 0.8408594319009468, + "train_speed(iter/s)": 0.029539 + }, + { + "epoch": 0.827166913323467, + "grad_norm": 0.05315795994218538, + "learning_rate": 1.5317714779688076e-06, + "loss": 0.44116387367248533, + "memory(GiB)": 77.63, + "step": 2585, + "token_acc": 0.8697747394374089, + "train_speed(iter/s)": 0.029549 + }, + { + "epoch": 0.828766849326027, + "grad_norm": 0.054322305095737905, + "learning_rate": 1.5042670161024975e-06, + "loss": 0.4457075119018555, + "memory(GiB)": 77.63, + "step": 2590, + "token_acc": 0.8946940985381701, + "train_speed(iter/s)": 0.029543 + }, + { + "epoch": 0.8303667853285869, + "grad_norm": 0.05625328099370699, + "learning_rate": 1.4769916351329495e-06, + "loss": 0.4413478851318359, + "memory(GiB)": 77.63, + "step": 2595, + "token_acc": 0.8992218637312583, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.8319667213311468, + "grad_norm": 0.055362561635933255, + "learning_rate": 1.4499460705198e-06, + "loss": 0.4511932373046875, + "memory(GiB)": 77.63, + "step": 2600, + "token_acc": 0.8438552188552189, + "train_speed(iter/s)": 0.029552 + }, + { + "epoch": 0.8335666573337066, + "grad_norm": 0.052438473367666195, + "learning_rate": 1.4231310515258745e-06, + "loss": 0.441973352432251, + "memory(GiB)": 77.63, + "step": 2605, + "token_acc": 0.8753952017853822, + "train_speed(iter/s)": 0.029546 + }, + { + "epoch": 0.8351665933362665, + "grad_norm": 0.05274974730016364, + "learning_rate": 1.396547301197504e-06, + "loss": 0.4393311977386475, + "memory(GiB)": 77.63, + "step": 2610, + "token_acc": 0.8518848700967906, + "train_speed(iter/s)": 0.029557 + }, + { + "epoch": 0.8367665293388264, + "grad_norm": 0.05853294004614818, + "learning_rate": 1.3701955363450447e-06, + "loss": 0.4380232810974121, + "memory(GiB)": 77.63, + "step": 2615, + "token_acc": 0.8570597362296354, + "train_speed(iter/s)": 0.029554 + }, + { + "epoch": 0.8383664653413864, + "grad_norm": 0.05410978790127522, + "learning_rate": 1.3440764675235384e-06, + "loss": 0.4373164653778076, + "memory(GiB)": 77.63, + "step": 2620, + "token_acc": 0.8798353909465021, + "train_speed(iter/s)": 0.029552 + }, + { + "epoch": 0.8399664013439463, + "grad_norm": 0.048967955063799855, + "learning_rate": 1.3181907990135624e-06, + "loss": 0.4333020210266113, + "memory(GiB)": 77.63, + "step": 2625, + "token_acc": 0.8836341008089608, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.8415663373465061, + "grad_norm": 0.048274089580157754, + "learning_rate": 1.2925392288022299e-06, + "loss": 0.4414947509765625, + "memory(GiB)": 77.63, + "step": 2630, + "token_acc": 0.8760546404178385, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.843166273349066, + "grad_norm": 0.053684823491607934, + "learning_rate": 1.267122448564374e-06, + "loss": 0.44922800064086915, + "memory(GiB)": 77.63, + "step": 2635, + "token_acc": 0.8554064052425748, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.8447662093516259, + "grad_norm": 0.05262528572569429, + "learning_rate": 1.2419411436439021e-06, + "loss": 0.4328805923461914, + "memory(GiB)": 77.63, + "step": 2640, + "token_acc": 0.8400081317340923, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.8463661453541859, + "grad_norm": 0.05549273276976771, + "learning_rate": 1.2169959930353049e-06, + "loss": 0.4460554599761963, + "memory(GiB)": 77.63, + "step": 2645, + "token_acc": 0.8804424157303371, + "train_speed(iter/s)": 0.029559 + }, + { + "epoch": 0.8479660813567458, + "grad_norm": 0.05180537064402683, + "learning_rate": 1.1922876693653584e-06, + "loss": 0.4503427505493164, + "memory(GiB)": 77.63, + "step": 2650, + "token_acc": 0.8934362934362934, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.8495660173593056, + "grad_norm": 0.05011013702559624, + "learning_rate": 1.1678168388749788e-06, + "loss": 0.4415099620819092, + "memory(GiB)": 77.63, + "step": 2655, + "token_acc": 0.8995949690897463, + "train_speed(iter/s)": 0.029566 + }, + { + "epoch": 0.8511659533618655, + "grad_norm": 0.057170743989864214, + "learning_rate": 1.1435841614012666e-06, + "loss": 0.44884433746337893, + "memory(GiB)": 77.63, + "step": 2660, + "token_acc": 0.854816112084063, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.8527658893644254, + "grad_norm": 0.054388272142607975, + "learning_rate": 1.1195902903597023e-06, + "loss": 0.439667797088623, + "memory(GiB)": 77.63, + "step": 2665, + "token_acc": 0.8846260387811634, + "train_speed(iter/s)": 0.02957 + }, + { + "epoch": 0.8543658253669854, + "grad_norm": 0.05039207218424233, + "learning_rate": 1.0958358727265438e-06, + "loss": 0.4384475231170654, + "memory(GiB)": 77.63, + "step": 2670, + "token_acc": 0.8525793222533995, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.8559657613695452, + "grad_norm": 0.0543645938545219, + "learning_rate": 1.0723215490213635e-06, + "loss": 0.4338691711425781, + "memory(GiB)": 77.63, + "step": 2675, + "token_acc": 0.853824495541999, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.8575656973721051, + "grad_norm": 0.0579168704227633, + "learning_rate": 1.0490479532897946e-06, + "loss": 0.458463716506958, + "memory(GiB)": 77.63, + "step": 2680, + "token_acc": 0.867092866756393, + "train_speed(iter/s)": 0.029566 + }, + { + "epoch": 0.859165633374665, + "grad_norm": 0.04996147776053655, + "learning_rate": 1.0260157130864178e-06, + "loss": 0.43754091262817385, + "memory(GiB)": 77.63, + "step": 2685, + "token_acc": 0.8611873713109128, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.8607655693772249, + "grad_norm": 0.053537353272037014, + "learning_rate": 1.0032254494578519e-06, + "loss": 0.44204487800598147, + "memory(GiB)": 77.63, + "step": 2690, + "token_acc": 0.8575780654988576, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.8623655053797848, + "grad_norm": 0.055119127073411836, + "learning_rate": 9.806777769260034e-07, + "loss": 0.4500781536102295, + "memory(GiB)": 77.63, + "step": 2695, + "token_acc": 0.8872294372294373, + "train_speed(iter/s)": 0.029562 + }, + { + "epoch": 0.8639654413823447, + "grad_norm": 0.0558713442289911, + "learning_rate": 9.583733034714982e-07, + "loss": 0.43947248458862304, + "memory(GiB)": 77.63, + "step": 2700, + "token_acc": 0.8926744522729466, + "train_speed(iter/s)": 0.029555 + }, + { + "epoch": 0.8655653773849046, + "grad_norm": 0.0552196144062876, + "learning_rate": 9.363126305172831e-07, + "loss": 0.4443229675292969, + "memory(GiB)": 77.63, + "step": 2705, + "token_acc": 0.9038031319910514, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.8671653133874645, + "grad_norm": 0.055723745057826034, + "learning_rate": 9.144963529124163e-07, + "loss": 0.42942004203796386, + "memory(GiB)": 77.63, + "step": 2710, + "token_acc": 0.8680161943319838, + "train_speed(iter/s)": 0.029557 + }, + { + "epoch": 0.8687652493900244, + "grad_norm": 0.0589746686821641, + "learning_rate": 8.929250589160166e-07, + "loss": 0.4397599220275879, + "memory(GiB)": 77.63, + "step": 2715, + "token_acc": 0.8713878713878714, + "train_speed(iter/s)": 0.029552 + }, + { + "epoch": 0.8703651853925843, + "grad_norm": 0.04909314017257213, + "learning_rate": 8.715993301814174e-07, + "loss": 0.44155421257019045, + "memory(GiB)": 77.63, + "step": 2720, + "token_acc": 0.8710053650571495, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.8719651213951441, + "grad_norm": 0.05047518453544575, + "learning_rate": 8.505197417404687e-07, + "loss": 0.43677616119384766, + "memory(GiB)": 77.63, + "step": 2725, + "token_acc": 0.8809886575249704, + "train_speed(iter/s)": 0.029556 + }, + { + "epoch": 0.8735650573977041, + "grad_norm": 0.05102151204327215, + "learning_rate": 8.296868619880372e-07, + "loss": 0.44188566207885743, + "memory(GiB)": 77.63, + "step": 2730, + "token_acc": 0.8547172833573602, + "train_speed(iter/s)": 0.029553 + }, + { + "epoch": 0.875164993400264, + "grad_norm": 0.04729834705444575, + "learning_rate": 8.091012526666797e-07, + "loss": 0.4441237926483154, + "memory(GiB)": 77.63, + "step": 2735, + "token_acc": 0.8537975972307066, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.8767649294028239, + "grad_norm": 0.047668539210598965, + "learning_rate": 7.887634688515e-07, + "loss": 0.4462736129760742, + "memory(GiB)": 77.63, + "step": 2740, + "token_acc": 0.903437815975733, + "train_speed(iter/s)": 0.029554 + }, + { + "epoch": 0.8783648654053838, + "grad_norm": 0.052216823887528664, + "learning_rate": 7.686740589351704e-07, + "loss": 0.44857120513916016, + "memory(GiB)": 77.63, + "step": 2745, + "token_acc": 0.8033573141486811, + "train_speed(iter/s)": 0.029556 + }, + { + "epoch": 0.8799648014079436, + "grad_norm": 0.055862979558343906, + "learning_rate": 7.488335646131628e-07, + "loss": 0.44959425926208496, + "memory(GiB)": 77.63, + "step": 2750, + "token_acc": 0.8605054151624548, + "train_speed(iter/s)": 0.029558 + }, + { + "epoch": 0.8799648014079436, + "eval_loss": 0.6577034592628479, + "eval_runtime": 106.4875, + "eval_samples_per_second": 188.642, + "eval_steps_per_second": 0.948, + "eval_token_acc": 0.8722958612553617, + "step": 2750 + }, + { + "epoch": 0.8815647374105036, + "grad_norm": 0.051528350081992934, + "learning_rate": 7.292425208691212e-07, + "loss": 0.43878631591796874, + "memory(GiB)": 77.63, + "step": 2755, + "token_acc": 0.8812832745626772, + "train_speed(iter/s)": 0.029532 + }, + { + "epoch": 0.8831646734130635, + "grad_norm": 0.05310175482611414, + "learning_rate": 7.099014559604556e-07, + "loss": 0.45635418891906737, + "memory(GiB)": 77.63, + "step": 2760, + "token_acc": 0.8894999360532038, + "train_speed(iter/s)": 0.029537 + }, + { + "epoch": 0.8847646094156234, + "grad_norm": 0.04975738369955541, + "learning_rate": 6.908108914040823e-07, + "loss": 0.4421397686004639, + "memory(GiB)": 77.63, + "step": 2765, + "token_acc": 0.9070493575117089, + "train_speed(iter/s)": 0.029548 + }, + { + "epoch": 0.8863645454181833, + "grad_norm": 0.053564472486824076, + "learning_rate": 6.71971341962373e-07, + "loss": 0.4513510227203369, + "memory(GiB)": 77.63, + "step": 2770, + "token_acc": 0.8660589060308556, + "train_speed(iter/s)": 0.02955 + }, + { + "epoch": 0.8879644814207431, + "grad_norm": 0.06332925713320489, + "learning_rate": 6.53383315629268e-07, + "loss": 0.4404273509979248, + "memory(GiB)": 77.63, + "step": 2775, + "token_acc": 0.8507806501151779, + "train_speed(iter/s)": 0.029546 + }, + { + "epoch": 0.889564417423303, + "grad_norm": 0.063294227744794, + "learning_rate": 6.350473136165836e-07, + "loss": 0.4379493236541748, + "memory(GiB)": 77.63, + "step": 2780, + "token_acc": 0.8879898461050294, + "train_speed(iter/s)": 0.029561 + }, + { + "epoch": 0.891164353425863, + "grad_norm": 0.05151642870451994, + "learning_rate": 6.169638303404912e-07, + "loss": 0.4380655765533447, + "memory(GiB)": 77.63, + "step": 2785, + "token_acc": 0.8904059040590406, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.8927642894284229, + "grad_norm": 0.05406168921762394, + "learning_rate": 5.991333534081878e-07, + "loss": 0.4479250907897949, + "memory(GiB)": 77.63, + "step": 2790, + "token_acc": 0.8831118813787792, + "train_speed(iter/s)": 0.02956 + }, + { + "epoch": 0.8943642254309827, + "grad_norm": 0.054911478200183335, + "learning_rate": 5.815563636047539e-07, + "loss": 0.43634886741638185, + "memory(GiB)": 77.63, + "step": 2795, + "token_acc": 0.8714865708931917, + "train_speed(iter/s)": 0.02957 + }, + { + "epoch": 0.8959641614335426, + "grad_norm": 0.05257509941727236, + "learning_rate": 5.64233334880181e-07, + "loss": 0.44048466682434084, + "memory(GiB)": 77.63, + "step": 2800, + "token_acc": 0.891296869625043, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.8975640974361025, + "grad_norm": 0.05532206374459385, + "learning_rate": 5.471647343365982e-07, + "loss": 0.44726853370666503, + "memory(GiB)": 77.63, + "step": 2805, + "token_acc": 0.8828892005610098, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.8991640334386625, + "grad_norm": 0.05107208389411162, + "learning_rate": 5.303510222156716e-07, + "loss": 0.4470540523529053, + "memory(GiB)": 77.63, + "step": 2810, + "token_acc": 0.8489765812281025, + "train_speed(iter/s)": 0.029574 + }, + { + "epoch": 0.9007639694412224, + "grad_norm": 0.055452171728558604, + "learning_rate": 5.137926518862013e-07, + "loss": 0.4417248249053955, + "memory(GiB)": 77.63, + "step": 2815, + "token_acc": 0.8739084132055378, + "train_speed(iter/s)": 0.029569 + }, + { + "epoch": 0.9023639054437822, + "grad_norm": 0.04866974543895629, + "learning_rate": 4.974900698318885e-07, + "loss": 0.4414045810699463, + "memory(GiB)": 77.63, + "step": 2820, + "token_acc": 0.8659420289855072, + "train_speed(iter/s)": 0.029581 + }, + { + "epoch": 0.9039638414463421, + "grad_norm": 0.05275403785935388, + "learning_rate": 4.814437156393048e-07, + "loss": 0.4543337821960449, + "memory(GiB)": 77.63, + "step": 2825, + "token_acc": 0.8396122896854425, + "train_speed(iter/s)": 0.029574 + }, + { + "epoch": 0.905563777448902, + "grad_norm": 0.05372217346495556, + "learning_rate": 4.656540219860317e-07, + "loss": 0.45271754264831543, + "memory(GiB)": 77.63, + "step": 2830, + "token_acc": 0.8707617789520036, + "train_speed(iter/s)": 0.029571 + }, + { + "epoch": 0.907163713451462, + "grad_norm": 0.05715427837146615, + "learning_rate": 4.501214146289956e-07, + "loss": 0.4418344497680664, + "memory(GiB)": 77.63, + "step": 2835, + "token_acc": 0.8788416882939489, + "train_speed(iter/s)": 0.02958 + }, + { + "epoch": 0.9087636494540219, + "grad_norm": 0.04986985198768239, + "learning_rate": 4.3484631239299356e-07, + "loss": 0.4437891960144043, + "memory(GiB)": 77.63, + "step": 2840, + "token_acc": 0.8431597023468803, + "train_speed(iter/s)": 0.029572 + }, + { + "epoch": 0.9103635854565817, + "grad_norm": 0.04999552116510165, + "learning_rate": 4.198291271593924e-07, + "loss": 0.44283204078674315, + "memory(GiB)": 77.63, + "step": 2845, + "token_acc": 0.8843727072633896, + "train_speed(iter/s)": 0.029575 + }, + { + "epoch": 0.9119635214591416, + "grad_norm": 0.047844581858855956, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.4449836254119873, + "memory(GiB)": 77.63, + "step": 2850, + "token_acc": 0.9034812490661885, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.9135634574617015, + "grad_norm": 0.05675006848739315, + "learning_rate": 3.9057012044127817e-07, + "loss": 0.44204154014587405, + "memory(GiB)": 77.63, + "step": 2855, + "token_acc": 0.8655569782330346, + "train_speed(iter/s)": 0.029569 + }, + { + "epoch": 0.9151633934642615, + "grad_norm": 0.054077703664588216, + "learning_rate": 3.7632908790334656e-07, + "loss": 0.4383398532867432, + "memory(GiB)": 77.63, + "step": 2860, + "token_acc": 0.8899396929824561, + "train_speed(iter/s)": 0.029578 + }, + { + "epoch": 0.9167633294668214, + "grad_norm": 0.055142279260510525, + "learning_rate": 3.6234755023970447e-07, + "loss": 0.4388674259185791, + "memory(GiB)": 77.63, + "step": 2865, + "token_acc": 0.8406133828996283, + "train_speed(iter/s)": 0.029574 + }, + { + "epoch": 0.9183632654693812, + "grad_norm": 0.05242909357272202, + "learning_rate": 3.4862588445174985e-07, + "loss": 0.44350008964538573, + "memory(GiB)": 77.63, + "step": 2870, + "token_acc": 0.8773854961832062, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.9199632014719411, + "grad_norm": 0.051431062939426, + "learning_rate": 3.3516446053363015e-07, + "loss": 0.43948516845703123, + "memory(GiB)": 77.63, + "step": 2875, + "token_acc": 0.8812238692512353, + "train_speed(iter/s)": 0.029578 + }, + { + "epoch": 0.921563137474501, + "grad_norm": 0.04842108410960974, + "learning_rate": 3.219636414622751e-07, + "loss": 0.44395694732666013, + "memory(GiB)": 77.63, + "step": 2880, + "token_acc": 0.872349158571624, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.923163073477061, + "grad_norm": 0.050854557927068264, + "learning_rate": 3.090237831876053e-07, + "loss": 0.4437469482421875, + "memory(GiB)": 77.63, + "step": 2885, + "token_acc": 0.8500566251415629, + "train_speed(iter/s)": 0.029572 + }, + { + "epoch": 0.9247630094796209, + "grad_norm": 0.05289158935619814, + "learning_rate": 2.9634523462293005e-07, + "loss": 0.439394474029541, + "memory(GiB)": 77.63, + "step": 2890, + "token_acc": 0.9039268013724743, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.9263629454821807, + "grad_norm": 0.0532633695315608, + "learning_rate": 2.839283376355506e-07, + "loss": 0.4414195537567139, + "memory(GiB)": 77.63, + "step": 2895, + "token_acc": 0.8979206049149339, + "train_speed(iter/s)": 0.029567 + }, + { + "epoch": 0.9279628814847406, + "grad_norm": 0.05197494947423765, + "learning_rate": 2.717734270375272e-07, + "loss": 0.4303572177886963, + "memory(GiB)": 77.63, + "step": 2900, + "token_acc": 0.8932318992654774, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.9295628174873005, + "grad_norm": 0.05715199054523632, + "learning_rate": 2.5988083057666534e-07, + "loss": 0.4488718032836914, + "memory(GiB)": 77.63, + "step": 2905, + "token_acc": 0.8734599589322382, + "train_speed(iter/s)": 0.029572 + }, + { + "epoch": 0.9311627534898604, + "grad_norm": 0.052180972248620894, + "learning_rate": 2.4825086892766745e-07, + "loss": 0.44499683380126953, + "memory(GiB)": 77.63, + "step": 2910, + "token_acc": 0.8798773215198501, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.9327626894924202, + "grad_norm": 0.05195036988843101, + "learning_rate": 2.3688385568349515e-07, + "loss": 0.4348268508911133, + "memory(GiB)": 77.63, + "step": 2915, + "token_acc": 0.8380835380835381, + "train_speed(iter/s)": 0.029578 + }, + { + "epoch": 0.9343626254949802, + "grad_norm": 0.05340475761749026, + "learning_rate": 2.2578009734690264e-07, + "loss": 0.4533662796020508, + "memory(GiB)": 77.63, + "step": 2920, + "token_acc": 0.8854700854700854, + "train_speed(iter/s)": 0.029569 + }, + { + "epoch": 0.9359625614975401, + "grad_norm": 0.047434143218971526, + "learning_rate": 2.1493989332218468e-07, + "loss": 0.4382183074951172, + "memory(GiB)": 77.63, + "step": 2925, + "token_acc": 0.880750496121234, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.9375624975001, + "grad_norm": 0.052509681961836426, + "learning_rate": 2.043635359070928e-07, + "loss": 0.44708704948425293, + "memory(GiB)": 77.63, + "step": 2930, + "token_acc": 0.8942012598933936, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.9391624335026599, + "grad_norm": 0.0531539858574761, + "learning_rate": 1.9405131028495838e-07, + "loss": 0.45058341026306153, + "memory(GiB)": 77.63, + "step": 2935, + "token_acc": 0.8614694335389792, + "train_speed(iter/s)": 0.029569 + }, + { + "epoch": 0.9407623695052197, + "grad_norm": 0.05931701393554437, + "learning_rate": 1.8400349451700438e-07, + "loss": 0.44367156028747556, + "memory(GiB)": 77.63, + "step": 2940, + "token_acc": 0.881156184096266, + "train_speed(iter/s)": 0.029573 + }, + { + "epoch": 0.9423623055077797, + "grad_norm": 0.05380572562718686, + "learning_rate": 1.742203595348435e-07, + "loss": 0.4424111843109131, + "memory(GiB)": 77.63, + "step": 2945, + "token_acc": 0.8769617074701821, + "train_speed(iter/s)": 0.029576 + }, + { + "epoch": 0.9439622415103396, + "grad_norm": 0.06156259021000458, + "learning_rate": 1.6470216913317628e-07, + "loss": 0.4509577751159668, + "memory(GiB)": 77.63, + "step": 2950, + "token_acc": 0.8544532947139754, + "train_speed(iter/s)": 0.029567 + }, + { + "epoch": 0.9455621775128995, + "grad_norm": 0.050697077683688974, + "learning_rate": 1.5544917996267562e-07, + "loss": 0.44117283821105957, + "memory(GiB)": 77.63, + "step": 2955, + "token_acc": 0.8515226026101759, + "train_speed(iter/s)": 0.029575 + }, + { + "epoch": 0.9471621135154594, + "grad_norm": 0.05205296295346333, + "learning_rate": 1.464616415230702e-07, + "loss": 0.4488182067871094, + "memory(GiB)": 77.63, + "step": 2960, + "token_acc": 0.874439461883408, + "train_speed(iter/s)": 0.029569 + }, + { + "epoch": 0.9487620495180192, + "grad_norm": 0.047632029637579856, + "learning_rate": 1.3773979615640976e-07, + "loss": 0.4415272235870361, + "memory(GiB)": 77.63, + "step": 2965, + "token_acc": 0.8889883616830797, + "train_speed(iter/s)": 0.029564 + }, + { + "epoch": 0.9503619855205792, + "grad_norm": 0.051226023380966074, + "learning_rate": 1.292838790405393e-07, + "loss": 0.4453396797180176, + "memory(GiB)": 77.63, + "step": 2970, + "token_acc": 0.8701866977829639, + "train_speed(iter/s)": 0.029572 + }, + { + "epoch": 0.9519619215231391, + "grad_norm": 0.05296626405711913, + "learning_rate": 1.2109411818274851e-07, + "loss": 0.44417614936828614, + "memory(GiB)": 77.63, + "step": 2975, + "token_acc": 0.8997547959036493, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.953561857525699, + "grad_norm": 0.0535756814263424, + "learning_rate": 1.1317073441363458e-07, + "loss": 0.444796085357666, + "memory(GiB)": 77.63, + "step": 2980, + "token_acc": 0.9101887677336147, + "train_speed(iter/s)": 0.029563 + }, + { + "epoch": 0.9551617935282589, + "grad_norm": 0.05005027675979017, + "learning_rate": 1.055139413811379e-07, + "loss": 0.45203323364257814, + "memory(GiB)": 77.63, + "step": 2985, + "token_acc": 0.8569892473118279, + "train_speed(iter/s)": 0.029568 + }, + { + "epoch": 0.9567617295308187, + "grad_norm": 0.04934807615166247, + "learning_rate": 9.812394554478355e-08, + "loss": 0.43912034034729003, + "memory(GiB)": 77.63, + "step": 2990, + "token_acc": 0.8557346268189642, + "train_speed(iter/s)": 0.029559 + }, + { + "epoch": 0.9583616655333786, + "grad_norm": 0.051618825055470385, + "learning_rate": 9.10009461701189e-08, + "loss": 0.4506105899810791, + "memory(GiB)": 77.63, + "step": 2995, + "token_acc": 0.7809948032665182, + "train_speed(iter/s)": 0.029565 + }, + { + "epoch": 0.9599616015359386, + "grad_norm": 0.054833054855342726, + "learning_rate": 8.41451353233369e-08, + "loss": 0.442844820022583, + "memory(GiB)": 77.63, + "step": 3000, + "token_acc": 0.8733862959285005, + "train_speed(iter/s)": 0.029563 + }, + { + "epoch": 0.9599616015359386, + "eval_loss": 0.6573547720909119, + "eval_runtime": 106.0877, + "eval_samples_per_second": 189.353, + "eval_steps_per_second": 0.952, + "eval_token_acc": 0.8724322608695082, + "step": 3000 + } + ], + "logging_steps": 5, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.944159044825703e+20, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}