|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 2000, |
|
"global_step": 7916, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0012632642748863063, |
|
"grad_norm": 0.3052225708961487, |
|
"learning_rate": 1.4705882352941177e-06, |
|
"loss": 1.3368, |
|
"num_input_tokens_seen": 1013408, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0025265285497726125, |
|
"grad_norm": 0.2797602713108063, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 1.3885, |
|
"num_input_tokens_seen": 1928736, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0037897928246589186, |
|
"grad_norm": 0.2704920470714569, |
|
"learning_rate": 4.4117647058823526e-06, |
|
"loss": 1.4081, |
|
"num_input_tokens_seen": 2911616, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.005053057099545225, |
|
"grad_norm": 0.26865851879119873, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.4143, |
|
"num_input_tokens_seen": 3775872, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.006316321374431531, |
|
"grad_norm": 0.29596275091171265, |
|
"learning_rate": 7.352941176470588e-06, |
|
"loss": 1.4061, |
|
"num_input_tokens_seen": 4755264, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007579585649317837, |
|
"grad_norm": 0.26764747500419617, |
|
"learning_rate": 8.823529411764705e-06, |
|
"loss": 1.3282, |
|
"num_input_tokens_seen": 5670016, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.008842849924204144, |
|
"grad_norm": 0.2855396866798401, |
|
"learning_rate": 1.0294117647058823e-05, |
|
"loss": 1.3441, |
|
"num_input_tokens_seen": 6541472, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01010611419909045, |
|
"grad_norm": 0.27064523100852966, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 1.3611, |
|
"num_input_tokens_seen": 7502848, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.011369378473976757, |
|
"grad_norm": 0.26372382044792175, |
|
"learning_rate": 1.3235294117647058e-05, |
|
"loss": 1.3355, |
|
"num_input_tokens_seen": 8450880, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.012632642748863061, |
|
"grad_norm": 0.2819940149784088, |
|
"learning_rate": 1.4705882352941175e-05, |
|
"loss": 1.4074, |
|
"num_input_tokens_seen": 9363776, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.013895907023749368, |
|
"grad_norm": 0.27858778834342957, |
|
"learning_rate": 1.6176470588235293e-05, |
|
"loss": 1.3922, |
|
"num_input_tokens_seen": 10362848, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.015159171298635674, |
|
"grad_norm": 0.26853179931640625, |
|
"learning_rate": 1.764705882352941e-05, |
|
"loss": 1.3606, |
|
"num_input_tokens_seen": 11337152, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.016422435573521982, |
|
"grad_norm": 0.29751917719841003, |
|
"learning_rate": 1.9117647058823524e-05, |
|
"loss": 1.3469, |
|
"num_input_tokens_seen": 12268864, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.017685699848408287, |
|
"grad_norm": 0.29996374249458313, |
|
"learning_rate": 2.0588235294117645e-05, |
|
"loss": 1.3455, |
|
"num_input_tokens_seen": 13304544, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.018948964123294592, |
|
"grad_norm": 0.26638367772102356, |
|
"learning_rate": 2.2058823529411763e-05, |
|
"loss": 1.3529, |
|
"num_input_tokens_seen": 14245088, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0202122283981809, |
|
"grad_norm": 0.2829771041870117, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 1.3517, |
|
"num_input_tokens_seen": 15277408, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.021475492673067205, |
|
"grad_norm": 0.28468722105026245, |
|
"learning_rate": 2.4999999999999998e-05, |
|
"loss": 1.3756, |
|
"num_input_tokens_seen": 16296480, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.022738756947953513, |
|
"grad_norm": 0.2717965841293335, |
|
"learning_rate": 2.6470588235294115e-05, |
|
"loss": 1.3094, |
|
"num_input_tokens_seen": 17249088, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.024002021222839818, |
|
"grad_norm": 0.2902025878429413, |
|
"learning_rate": 2.7941176470588236e-05, |
|
"loss": 1.3894, |
|
"num_input_tokens_seen": 18267872, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.025265285497726123, |
|
"grad_norm": 0.27164924144744873, |
|
"learning_rate": 2.941176470588235e-05, |
|
"loss": 1.3471, |
|
"num_input_tokens_seen": 19228288, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02652854977261243, |
|
"grad_norm": 0.2791699767112732, |
|
"learning_rate": 3.088235294117647e-05, |
|
"loss": 1.3676, |
|
"num_input_tokens_seen": 20112768, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.027791814047498736, |
|
"grad_norm": 0.27457180619239807, |
|
"learning_rate": 3.2352941176470585e-05, |
|
"loss": 1.3667, |
|
"num_input_tokens_seen": 21080384, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.029055078322385044, |
|
"grad_norm": 0.2744538486003876, |
|
"learning_rate": 3.38235294117647e-05, |
|
"loss": 1.3791, |
|
"num_input_tokens_seen": 21978464, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03031834259727135, |
|
"grad_norm": 0.27631092071533203, |
|
"learning_rate": 3.49999941403517e-05, |
|
"loss": 1.3032, |
|
"num_input_tokens_seen": 22891136, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03158160687215766, |
|
"grad_norm": 0.25807875394821167, |
|
"learning_rate": 3.499978905307333e-05, |
|
"loss": 1.3203, |
|
"num_input_tokens_seen": 23788384, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.032844871147043965, |
|
"grad_norm": 0.282926470041275, |
|
"learning_rate": 3.499929098730414e-05, |
|
"loss": 1.3487, |
|
"num_input_tokens_seen": 24732448, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.034108135421930266, |
|
"grad_norm": 0.3243197500705719, |
|
"learning_rate": 3.499849995138268e-05, |
|
"loss": 1.3335, |
|
"num_input_tokens_seen": 25651072, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.035371399696816574, |
|
"grad_norm": 0.28631719946861267, |
|
"learning_rate": 3.499741595855233e-05, |
|
"loss": 1.3104, |
|
"num_input_tokens_seen": 26588256, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03663466397170288, |
|
"grad_norm": 0.2739802598953247, |
|
"learning_rate": 3.499603902696111e-05, |
|
"loss": 1.3294, |
|
"num_input_tokens_seen": 27506400, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.037897928246589184, |
|
"grad_norm": 0.25884002447128296, |
|
"learning_rate": 3.499436917966138e-05, |
|
"loss": 1.3253, |
|
"num_input_tokens_seen": 28436096, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03916119252147549, |
|
"grad_norm": 0.3526857793331146, |
|
"learning_rate": 3.4992406444609434e-05, |
|
"loss": 1.3731, |
|
"num_input_tokens_seen": 29415744, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0404244567963618, |
|
"grad_norm": 0.3010634183883667, |
|
"learning_rate": 3.499015085466505e-05, |
|
"loss": 1.3604, |
|
"num_input_tokens_seen": 30396288, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0416877210712481, |
|
"grad_norm": 0.30412164330482483, |
|
"learning_rate": 3.498760244759094e-05, |
|
"loss": 1.3192, |
|
"num_input_tokens_seen": 31281632, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04295098534613441, |
|
"grad_norm": 0.28709614276885986, |
|
"learning_rate": 3.498476126605209e-05, |
|
"loss": 1.3405, |
|
"num_input_tokens_seen": 32139296, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.04421424962102072, |
|
"grad_norm": 0.2636132836341858, |
|
"learning_rate": 3.4981627357615085e-05, |
|
"loss": 1.3796, |
|
"num_input_tokens_seen": 33140544, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.045477513895907026, |
|
"grad_norm": 0.27414971590042114, |
|
"learning_rate": 3.497820077474728e-05, |
|
"loss": 1.3502, |
|
"num_input_tokens_seen": 34072480, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04674077817079333, |
|
"grad_norm": 0.29717832803726196, |
|
"learning_rate": 3.4974481574815955e-05, |
|
"loss": 1.3218, |
|
"num_input_tokens_seen": 35043552, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.048004042445679636, |
|
"grad_norm": 0.274935781955719, |
|
"learning_rate": 3.49704698200873e-05, |
|
"loss": 1.3101, |
|
"num_input_tokens_seen": 36057536, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.049267306720565944, |
|
"grad_norm": 0.2995646893978119, |
|
"learning_rate": 3.496616557772545e-05, |
|
"loss": 1.3231, |
|
"num_input_tokens_seen": 37053280, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.050530570995452245, |
|
"grad_norm": 0.2813841998577118, |
|
"learning_rate": 3.4961568919791295e-05, |
|
"loss": 1.3073, |
|
"num_input_tokens_seen": 37949760, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.051793835270338554, |
|
"grad_norm": 0.25323453545570374, |
|
"learning_rate": 3.49566799232413e-05, |
|
"loss": 1.4188, |
|
"num_input_tokens_seen": 38825888, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.05305709954522486, |
|
"grad_norm": 0.3185766339302063, |
|
"learning_rate": 3.4951498669926205e-05, |
|
"loss": 1.2551, |
|
"num_input_tokens_seen": 39816832, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.05432036382011117, |
|
"grad_norm": 0.282988041639328, |
|
"learning_rate": 3.494602524658968e-05, |
|
"loss": 1.3429, |
|
"num_input_tokens_seen": 40746208, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05558362809499747, |
|
"grad_norm": 0.29383236169815063, |
|
"learning_rate": 3.494025974486684e-05, |
|
"loss": 1.2908, |
|
"num_input_tokens_seen": 41732576, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05684689236988378, |
|
"grad_norm": 0.2495247721672058, |
|
"learning_rate": 3.4934202261282736e-05, |
|
"loss": 1.3379, |
|
"num_input_tokens_seen": 42725664, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05811015664477009, |
|
"grad_norm": 0.27226462960243225, |
|
"learning_rate": 3.4927852897250736e-05, |
|
"loss": 1.2906, |
|
"num_input_tokens_seen": 43636000, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.05937342091965639, |
|
"grad_norm": 0.2738124430179596, |
|
"learning_rate": 3.49212117590708e-05, |
|
"loss": 1.3382, |
|
"num_input_tokens_seen": 44584384, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0606366851945427, |
|
"grad_norm": 0.2823927700519562, |
|
"learning_rate": 3.4914278957927746e-05, |
|
"loss": 1.3572, |
|
"num_input_tokens_seen": 45563296, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.061899949469429005, |
|
"grad_norm": 0.3090139329433441, |
|
"learning_rate": 3.490705460988934e-05, |
|
"loss": 1.3633, |
|
"num_input_tokens_seen": 46504000, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06316321374431531, |
|
"grad_norm": 0.2648494839668274, |
|
"learning_rate": 3.4899538835904395e-05, |
|
"loss": 1.296, |
|
"num_input_tokens_seen": 47469568, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06442647801920162, |
|
"grad_norm": 0.26772260665893555, |
|
"learning_rate": 3.489173176180072e-05, |
|
"loss": 1.3468, |
|
"num_input_tokens_seen": 48428992, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.06568974229408793, |
|
"grad_norm": 0.2722509503364563, |
|
"learning_rate": 3.488363351828301e-05, |
|
"loss": 1.3298, |
|
"num_input_tokens_seen": 49435616, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.06695300656897422, |
|
"grad_norm": 0.33240431547164917, |
|
"learning_rate": 3.48752442409307e-05, |
|
"loss": 1.3395, |
|
"num_input_tokens_seen": 50444960, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.06821627084386053, |
|
"grad_norm": 0.33877724409103394, |
|
"learning_rate": 3.4866564070195623e-05, |
|
"loss": 1.3627, |
|
"num_input_tokens_seen": 51354144, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06947953511874684, |
|
"grad_norm": 0.25358885526657104, |
|
"learning_rate": 3.485759315139974e-05, |
|
"loss": 1.3665, |
|
"num_input_tokens_seen": 52353568, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.07074279939363315, |
|
"grad_norm": 0.3228625953197479, |
|
"learning_rate": 3.484833163473263e-05, |
|
"loss": 1.3603, |
|
"num_input_tokens_seen": 53330208, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.07200606366851946, |
|
"grad_norm": 0.27047306299209595, |
|
"learning_rate": 3.483877967524903e-05, |
|
"loss": 1.3918, |
|
"num_input_tokens_seen": 54292704, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.07326932794340577, |
|
"grad_norm": 0.23836977779865265, |
|
"learning_rate": 3.482893743286624e-05, |
|
"loss": 1.3265, |
|
"num_input_tokens_seen": 55289088, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.07453259221829207, |
|
"grad_norm": 0.2790107727050781, |
|
"learning_rate": 3.4818805072361394e-05, |
|
"loss": 1.34, |
|
"num_input_tokens_seen": 56191520, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.07579585649317837, |
|
"grad_norm": 0.2909539043903351, |
|
"learning_rate": 3.4808382763368746e-05, |
|
"loss": 1.3827, |
|
"num_input_tokens_seen": 57130144, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07705912076806468, |
|
"grad_norm": 0.2930690050125122, |
|
"learning_rate": 3.479767068037682e-05, |
|
"loss": 1.2993, |
|
"num_input_tokens_seen": 58166976, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.07832238504295098, |
|
"grad_norm": 0.2910405993461609, |
|
"learning_rate": 3.4786669002725486e-05, |
|
"loss": 1.4025, |
|
"num_input_tokens_seen": 59115968, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.07958564931783729, |
|
"grad_norm": 0.2609618008136749, |
|
"learning_rate": 3.477537791460297e-05, |
|
"loss": 1.3454, |
|
"num_input_tokens_seen": 60097152, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0808489135927236, |
|
"grad_norm": 0.2621832489967346, |
|
"learning_rate": 3.4763797605042735e-05, |
|
"loss": 1.3193, |
|
"num_input_tokens_seen": 61038400, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.08211217786760991, |
|
"grad_norm": 0.2869206666946411, |
|
"learning_rate": 3.475192826792036e-05, |
|
"loss": 1.3755, |
|
"num_input_tokens_seen": 62005408, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.0833754421424962, |
|
"grad_norm": 0.2955986261367798, |
|
"learning_rate": 3.473977010195027e-05, |
|
"loss": 1.3446, |
|
"num_input_tokens_seen": 62938944, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.08463870641738251, |
|
"grad_norm": 0.27759358286857605, |
|
"learning_rate": 3.47273233106824e-05, |
|
"loss": 1.3243, |
|
"num_input_tokens_seen": 63825280, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.08590197069226882, |
|
"grad_norm": 0.2854154706001282, |
|
"learning_rate": 3.471458810249883e-05, |
|
"loss": 1.3274, |
|
"num_input_tokens_seen": 64772224, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.08716523496715513, |
|
"grad_norm": 0.26865917444229126, |
|
"learning_rate": 3.470156469061023e-05, |
|
"loss": 1.3368, |
|
"num_input_tokens_seen": 65757408, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.08842849924204144, |
|
"grad_norm": 0.3124206066131592, |
|
"learning_rate": 3.468825329305235e-05, |
|
"loss": 1.3619, |
|
"num_input_tokens_seen": 66653856, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08969176351692774, |
|
"grad_norm": 0.257878839969635, |
|
"learning_rate": 3.467465413268235e-05, |
|
"loss": 1.3705, |
|
"num_input_tokens_seen": 67551136, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.09095502779181405, |
|
"grad_norm": 0.3039745092391968, |
|
"learning_rate": 3.466076743717506e-05, |
|
"loss": 1.3407, |
|
"num_input_tokens_seen": 68461888, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.09221829206670035, |
|
"grad_norm": 0.297577828168869, |
|
"learning_rate": 3.4646593439019164e-05, |
|
"loss": 1.3068, |
|
"num_input_tokens_seen": 69439936, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.09348155634158666, |
|
"grad_norm": 0.26858824491500854, |
|
"learning_rate": 3.463213237551333e-05, |
|
"loss": 1.3362, |
|
"num_input_tokens_seen": 70315520, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.09474482061647296, |
|
"grad_norm": 0.32382968068122864, |
|
"learning_rate": 3.461738448876223e-05, |
|
"loss": 1.2972, |
|
"num_input_tokens_seen": 71249088, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.09600808489135927, |
|
"grad_norm": 0.2890531122684479, |
|
"learning_rate": 3.460235002567247e-05, |
|
"loss": 1.2899, |
|
"num_input_tokens_seen": 72123200, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.09727134916624558, |
|
"grad_norm": 0.2724192440509796, |
|
"learning_rate": 3.458702923794847e-05, |
|
"loss": 1.3435, |
|
"num_input_tokens_seen": 73014048, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.09853461344113189, |
|
"grad_norm": 0.2698012888431549, |
|
"learning_rate": 3.457142238208826e-05, |
|
"loss": 1.3823, |
|
"num_input_tokens_seen": 73970912, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.0997978777160182, |
|
"grad_norm": 0.25855422019958496, |
|
"learning_rate": 3.455552971937915e-05, |
|
"loss": 1.3545, |
|
"num_input_tokens_seen": 74960032, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.10106114199090449, |
|
"grad_norm": 0.3183737099170685, |
|
"learning_rate": 3.453935151589341e-05, |
|
"loss": 1.3597, |
|
"num_input_tokens_seen": 75886048, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1023244062657908, |
|
"grad_norm": 0.2935165464878082, |
|
"learning_rate": 3.4522888042483766e-05, |
|
"loss": 1.3745, |
|
"num_input_tokens_seen": 76882752, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.10358767054067711, |
|
"grad_norm": 0.2568333148956299, |
|
"learning_rate": 3.450613957477889e-05, |
|
"loss": 1.3502, |
|
"num_input_tokens_seen": 77780736, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.10485093481556342, |
|
"grad_norm": 0.29373618960380554, |
|
"learning_rate": 3.4489106393178774e-05, |
|
"loss": 1.33, |
|
"num_input_tokens_seen": 78738272, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.10611419909044972, |
|
"grad_norm": 0.2722548246383667, |
|
"learning_rate": 3.447178878285004e-05, |
|
"loss": 1.3533, |
|
"num_input_tokens_seen": 79636736, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.10737746336533603, |
|
"grad_norm": 0.29016321897506714, |
|
"learning_rate": 3.445418703372119e-05, |
|
"loss": 1.365, |
|
"num_input_tokens_seen": 80603008, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.10864072764022234, |
|
"grad_norm": 0.2636987268924713, |
|
"learning_rate": 3.443630144047771e-05, |
|
"loss": 1.3284, |
|
"num_input_tokens_seen": 81556992, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.10990399191510863, |
|
"grad_norm": 0.2925853133201599, |
|
"learning_rate": 3.441813230255714e-05, |
|
"loss": 1.306, |
|
"num_input_tokens_seen": 82544128, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.11116725618999494, |
|
"grad_norm": 0.32026803493499756, |
|
"learning_rate": 3.439967992414412e-05, |
|
"loss": 1.2703, |
|
"num_input_tokens_seen": 83488864, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.11243052046488125, |
|
"grad_norm": 0.2739593982696533, |
|
"learning_rate": 3.438094461416522e-05, |
|
"loss": 1.3276, |
|
"num_input_tokens_seen": 84447232, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.11369378473976756, |
|
"grad_norm": 0.26780998706817627, |
|
"learning_rate": 3.4361926686283805e-05, |
|
"loss": 1.3311, |
|
"num_input_tokens_seen": 85353344, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.11495704901465387, |
|
"grad_norm": 0.3547651469707489, |
|
"learning_rate": 3.43426264588948e-05, |
|
"loss": 1.3696, |
|
"num_input_tokens_seen": 86331744, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.11622031328954018, |
|
"grad_norm": 0.2572576105594635, |
|
"learning_rate": 3.4323044255119314e-05, |
|
"loss": 1.3226, |
|
"num_input_tokens_seen": 87350592, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.11748357756442648, |
|
"grad_norm": 0.26348087191581726, |
|
"learning_rate": 3.430318040279929e-05, |
|
"loss": 1.339, |
|
"num_input_tokens_seen": 88312000, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.11874684183931278, |
|
"grad_norm": 0.2919277846813202, |
|
"learning_rate": 3.428303523449194e-05, |
|
"loss": 1.3158, |
|
"num_input_tokens_seen": 89257856, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.12001010611419909, |
|
"grad_norm": 0.2658417820930481, |
|
"learning_rate": 3.426260908746427e-05, |
|
"loss": 1.3073, |
|
"num_input_tokens_seen": 90244352, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.1212733703890854, |
|
"grad_norm": 0.28189846873283386, |
|
"learning_rate": 3.424190230368733e-05, |
|
"loss": 1.3125, |
|
"num_input_tokens_seen": 91129440, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.1225366346639717, |
|
"grad_norm": 0.279550701379776, |
|
"learning_rate": 3.422091522983059e-05, |
|
"loss": 1.2755, |
|
"num_input_tokens_seen": 92033408, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.12379989893885801, |
|
"grad_norm": 0.28984683752059937, |
|
"learning_rate": 3.419964821725607e-05, |
|
"loss": 1.3188, |
|
"num_input_tokens_seen": 92960864, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.1250631632137443, |
|
"grad_norm": 0.2627594769001007, |
|
"learning_rate": 3.417810162201247e-05, |
|
"loss": 1.3248, |
|
"num_input_tokens_seen": 93996960, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.12632642748863063, |
|
"grad_norm": 0.2966674864292145, |
|
"learning_rate": 3.415627580482923e-05, |
|
"loss": 1.3486, |
|
"num_input_tokens_seen": 94925600, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12758969176351692, |
|
"grad_norm": 0.2634032666683197, |
|
"learning_rate": 3.413417113111045e-05, |
|
"loss": 1.3315, |
|
"num_input_tokens_seen": 95851200, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.12885295603840324, |
|
"grad_norm": 0.29642611742019653, |
|
"learning_rate": 3.4111787970928835e-05, |
|
"loss": 1.2694, |
|
"num_input_tokens_seen": 96800640, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.13011622031328954, |
|
"grad_norm": 0.25690603256225586, |
|
"learning_rate": 3.408912669901943e-05, |
|
"loss": 1.3334, |
|
"num_input_tokens_seen": 97827232, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.13137948458817586, |
|
"grad_norm": 0.2836136817932129, |
|
"learning_rate": 3.40661876947734e-05, |
|
"loss": 1.3122, |
|
"num_input_tokens_seen": 98797088, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.13264274886306215, |
|
"grad_norm": 0.2613033354282379, |
|
"learning_rate": 3.4042971342231655e-05, |
|
"loss": 1.3665, |
|
"num_input_tokens_seen": 99772384, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.13390601313794845, |
|
"grad_norm": 0.2632371485233307, |
|
"learning_rate": 3.401947803007841e-05, |
|
"loss": 1.342, |
|
"num_input_tokens_seen": 100704544, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.13516927741283477, |
|
"grad_norm": 0.25628045201301575, |
|
"learning_rate": 3.399570815163471e-05, |
|
"loss": 1.3686, |
|
"num_input_tokens_seen": 101608800, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.13643254168772107, |
|
"grad_norm": 0.23973917961120605, |
|
"learning_rate": 3.397166210485182e-05, |
|
"loss": 1.393, |
|
"num_input_tokens_seen": 102571712, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.1376958059626074, |
|
"grad_norm": 0.32102668285369873, |
|
"learning_rate": 3.394734029230454e-05, |
|
"loss": 1.2795, |
|
"num_input_tokens_seen": 103472640, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.13895907023749368, |
|
"grad_norm": 0.2778148651123047, |
|
"learning_rate": 3.3922743121184533e-05, |
|
"loss": 1.2751, |
|
"num_input_tokens_seen": 104464224, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.14022233451238, |
|
"grad_norm": 0.2992386221885681, |
|
"learning_rate": 3.3897871003293454e-05, |
|
"loss": 1.2715, |
|
"num_input_tokens_seen": 105472736, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.1414855987872663, |
|
"grad_norm": 0.2530061900615692, |
|
"learning_rate": 3.3872724355036066e-05, |
|
"loss": 1.3162, |
|
"num_input_tokens_seen": 106384480, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.1427488630621526, |
|
"grad_norm": 0.2719084918498993, |
|
"learning_rate": 3.384730359741327e-05, |
|
"loss": 1.2827, |
|
"num_input_tokens_seen": 107319712, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.14401212733703891, |
|
"grad_norm": 0.26223063468933105, |
|
"learning_rate": 3.3821609156015086e-05, |
|
"loss": 1.3352, |
|
"num_input_tokens_seen": 108260576, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.1452753916119252, |
|
"grad_norm": 0.28642159700393677, |
|
"learning_rate": 3.3795641461013454e-05, |
|
"loss": 1.3423, |
|
"num_input_tokens_seen": 109234720, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.14653865588681153, |
|
"grad_norm": 0.3532911539077759, |
|
"learning_rate": 3.376940094715512e-05, |
|
"loss": 1.3319, |
|
"num_input_tokens_seen": 110154176, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.14780192016169783, |
|
"grad_norm": 0.2519535720348358, |
|
"learning_rate": 3.3742888053754295e-05, |
|
"loss": 1.3348, |
|
"num_input_tokens_seen": 111066432, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.14906518443658415, |
|
"grad_norm": 0.28797778487205505, |
|
"learning_rate": 3.371610322468534e-05, |
|
"loss": 1.3478, |
|
"num_input_tokens_seen": 112032064, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.15032844871147044, |
|
"grad_norm": 0.2780948281288147, |
|
"learning_rate": 3.368904690837529e-05, |
|
"loss": 1.3099, |
|
"num_input_tokens_seen": 113065184, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.15159171298635674, |
|
"grad_norm": 0.3206534683704376, |
|
"learning_rate": 3.3661719557796405e-05, |
|
"loss": 1.3218, |
|
"num_input_tokens_seen": 114056096, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.15285497726124306, |
|
"grad_norm": 0.30456361174583435, |
|
"learning_rate": 3.363412163045853e-05, |
|
"loss": 1.3439, |
|
"num_input_tokens_seen": 115039808, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.15411824153612935, |
|
"grad_norm": 0.27767330408096313, |
|
"learning_rate": 3.3606253588401474e-05, |
|
"loss": 1.2642, |
|
"num_input_tokens_seen": 115943872, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.15538150581101567, |
|
"grad_norm": 0.25447219610214233, |
|
"learning_rate": 3.357811589818724e-05, |
|
"loss": 1.3209, |
|
"num_input_tokens_seen": 116934144, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.15664477008590197, |
|
"grad_norm": 0.28984275460243225, |
|
"learning_rate": 3.354970903089228e-05, |
|
"loss": 1.2694, |
|
"num_input_tokens_seen": 117866592, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.15790803436078826, |
|
"grad_norm": 0.2603750228881836, |
|
"learning_rate": 3.3521033462099505e-05, |
|
"loss": 1.3538, |
|
"num_input_tokens_seen": 118792000, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.15917129863567459, |
|
"grad_norm": 0.2679465413093567, |
|
"learning_rate": 3.3492089671890414e-05, |
|
"loss": 1.3708, |
|
"num_input_tokens_seen": 119700608, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.16043456291056088, |
|
"grad_norm": 0.2753802537918091, |
|
"learning_rate": 3.346287814483703e-05, |
|
"loss": 1.2785, |
|
"num_input_tokens_seen": 120664544, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.1616978271854472, |
|
"grad_norm": 0.2532285153865814, |
|
"learning_rate": 3.3433399369993764e-05, |
|
"loss": 1.3176, |
|
"num_input_tokens_seen": 121630656, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1629610914603335, |
|
"grad_norm": 0.2713632583618164, |
|
"learning_rate": 3.340365384088924e-05, |
|
"loss": 1.2721, |
|
"num_input_tokens_seen": 122593728, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.16422435573521982, |
|
"grad_norm": 0.31818637251853943, |
|
"learning_rate": 3.337364205551805e-05, |
|
"loss": 1.3474, |
|
"num_input_tokens_seen": 123604064, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1654876200101061, |
|
"grad_norm": 0.28953075408935547, |
|
"learning_rate": 3.3343364516332404e-05, |
|
"loss": 1.3117, |
|
"num_input_tokens_seen": 124606080, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.1667508842849924, |
|
"grad_norm": 0.32029005885124207, |
|
"learning_rate": 3.331282173023371e-05, |
|
"loss": 1.3281, |
|
"num_input_tokens_seen": 125569664, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.16801414855987873, |
|
"grad_norm": 0.2608253061771393, |
|
"learning_rate": 3.328201420856409e-05, |
|
"loss": 1.2915, |
|
"num_input_tokens_seen": 126460768, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.16927741283476502, |
|
"grad_norm": 0.2563798725605011, |
|
"learning_rate": 3.3250942467097835e-05, |
|
"loss": 1.3308, |
|
"num_input_tokens_seen": 127405408, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.17054067710965135, |
|
"grad_norm": 0.26563408970832825, |
|
"learning_rate": 3.3219607026032747e-05, |
|
"loss": 1.294, |
|
"num_input_tokens_seen": 128331968, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.17180394138453764, |
|
"grad_norm": 0.2531772553920746, |
|
"learning_rate": 3.318800840998146e-05, |
|
"loss": 1.3276, |
|
"num_input_tokens_seen": 129301248, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.17306720565942396, |
|
"grad_norm": 0.2774362862110138, |
|
"learning_rate": 3.3156147147962623e-05, |
|
"loss": 1.2639, |
|
"num_input_tokens_seen": 130282336, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.17433046993431026, |
|
"grad_norm": 0.284277081489563, |
|
"learning_rate": 3.312402377339206e-05, |
|
"loss": 1.3216, |
|
"num_input_tokens_seen": 131225056, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.17559373420919655, |
|
"grad_norm": 0.2917383015155792, |
|
"learning_rate": 3.309163882407384e-05, |
|
"loss": 1.2568, |
|
"num_input_tokens_seen": 132157504, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.17685699848408287, |
|
"grad_norm": 0.2731410264968872, |
|
"learning_rate": 3.305899284219128e-05, |
|
"loss": 1.3375, |
|
"num_input_tokens_seen": 133115200, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.17812026275896917, |
|
"grad_norm": 0.28233301639556885, |
|
"learning_rate": 3.302608637429786e-05, |
|
"loss": 1.2466, |
|
"num_input_tokens_seen": 134032192, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.1793835270338555, |
|
"grad_norm": 0.2799434959888458, |
|
"learning_rate": 3.2992919971308055e-05, |
|
"loss": 1.2824, |
|
"num_input_tokens_seen": 134994208, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.18064679130874178, |
|
"grad_norm": 0.29594945907592773, |
|
"learning_rate": 3.295949418848814e-05, |
|
"loss": 1.3309, |
|
"num_input_tokens_seen": 135938144, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1819100555836281, |
|
"grad_norm": 0.318526953458786, |
|
"learning_rate": 3.29258095854469e-05, |
|
"loss": 1.2905, |
|
"num_input_tokens_seen": 136866336, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.1831733198585144, |
|
"grad_norm": 0.2683306634426117, |
|
"learning_rate": 3.289186672612621e-05, |
|
"loss": 1.2648, |
|
"num_input_tokens_seen": 137815456, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.1844365841334007, |
|
"grad_norm": 0.27116644382476807, |
|
"learning_rate": 3.2857666178791656e-05, |
|
"loss": 1.2829, |
|
"num_input_tokens_seen": 138780256, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.18569984840828702, |
|
"grad_norm": 0.28254273533821106, |
|
"learning_rate": 3.282320851602298e-05, |
|
"loss": 1.3141, |
|
"num_input_tokens_seen": 139750496, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.1869631126831733, |
|
"grad_norm": 0.26385799050331116, |
|
"learning_rate": 3.2788494314704503e-05, |
|
"loss": 1.329, |
|
"num_input_tokens_seen": 140654176, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.18822637695805963, |
|
"grad_norm": 0.273930162191391, |
|
"learning_rate": 3.275352415601548e-05, |
|
"loss": 1.3267, |
|
"num_input_tokens_seen": 141615424, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.18948964123294593, |
|
"grad_norm": 0.2711365520954132, |
|
"learning_rate": 3.2718298625420366e-05, |
|
"loss": 1.2756, |
|
"num_input_tokens_seen": 142543328, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.19075290550783225, |
|
"grad_norm": 0.27136221528053284, |
|
"learning_rate": 3.268281831265899e-05, |
|
"loss": 1.3284, |
|
"num_input_tokens_seen": 143524416, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.19201616978271854, |
|
"grad_norm": 0.31618639826774597, |
|
"learning_rate": 3.264708381173672e-05, |
|
"loss": 1.3199, |
|
"num_input_tokens_seen": 144454016, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.19327943405760484, |
|
"grad_norm": 0.4721730053424835, |
|
"learning_rate": 3.261109572091448e-05, |
|
"loss": 1.3317, |
|
"num_input_tokens_seen": 145434336, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.19454269833249116, |
|
"grad_norm": 0.2652052342891693, |
|
"learning_rate": 3.257485464269878e-05, |
|
"loss": 1.3733, |
|
"num_input_tokens_seen": 146342112, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.19580596260737745, |
|
"grad_norm": 0.25424447655677795, |
|
"learning_rate": 3.253836118383157e-05, |
|
"loss": 1.2725, |
|
"num_input_tokens_seen": 147287264, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.19706922688226378, |
|
"grad_norm": 0.2884797751903534, |
|
"learning_rate": 3.2501615955280134e-05, |
|
"loss": 1.3223, |
|
"num_input_tokens_seen": 148183456, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.19833249115715007, |
|
"grad_norm": 0.2777753174304962, |
|
"learning_rate": 3.2464619572226836e-05, |
|
"loss": 1.3182, |
|
"num_input_tokens_seen": 149094624, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.1995957554320364, |
|
"grad_norm": 0.27247852087020874, |
|
"learning_rate": 3.242737265405882e-05, |
|
"loss": 1.3171, |
|
"num_input_tokens_seen": 149997920, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.2008590197069227, |
|
"grad_norm": 0.2738061249256134, |
|
"learning_rate": 3.238987582435767e-05, |
|
"loss": 1.2938, |
|
"num_input_tokens_seen": 150960064, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.20212228398180898, |
|
"grad_norm": 0.2913673520088196, |
|
"learning_rate": 3.235212971088891e-05, |
|
"loss": 1.3214, |
|
"num_input_tokens_seen": 151918208, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2033855482566953, |
|
"grad_norm": 0.279725044965744, |
|
"learning_rate": 3.231413494559156e-05, |
|
"loss": 1.2746, |
|
"num_input_tokens_seen": 152856864, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.2046488125315816, |
|
"grad_norm": 0.27453747391700745, |
|
"learning_rate": 3.227589216456752e-05, |
|
"loss": 1.3174, |
|
"num_input_tokens_seen": 153804192, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.20591207680646792, |
|
"grad_norm": 0.22528155148029327, |
|
"learning_rate": 3.223740200807091e-05, |
|
"loss": 1.2745, |
|
"num_input_tokens_seen": 154817632, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.20717534108135421, |
|
"grad_norm": 0.27404505014419556, |
|
"learning_rate": 3.2198665120497394e-05, |
|
"loss": 1.3032, |
|
"num_input_tokens_seen": 155756448, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.20843860535624054, |
|
"grad_norm": 0.32085704803466797, |
|
"learning_rate": 3.215968215037334e-05, |
|
"loss": 1.3325, |
|
"num_input_tokens_seen": 156763232, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.20970186963112683, |
|
"grad_norm": 0.27827686071395874, |
|
"learning_rate": 3.212045375034501e-05, |
|
"loss": 1.2955, |
|
"num_input_tokens_seen": 157709600, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.21096513390601312, |
|
"grad_norm": 0.2595587968826294, |
|
"learning_rate": 3.20809805771676e-05, |
|
"loss": 1.2932, |
|
"num_input_tokens_seen": 158695680, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.21222839818089945, |
|
"grad_norm": 0.26113271713256836, |
|
"learning_rate": 3.204126329169426e-05, |
|
"loss": 1.2886, |
|
"num_input_tokens_seen": 159651584, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.21349166245578574, |
|
"grad_norm": 0.3666292428970337, |
|
"learning_rate": 3.200130255886503e-05, |
|
"loss": 1.3232, |
|
"num_input_tokens_seen": 160621120, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.21475492673067206, |
|
"grad_norm": 0.30534592270851135, |
|
"learning_rate": 3.196109904769568e-05, |
|
"loss": 1.3539, |
|
"num_input_tokens_seen": 161585024, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.21601819100555836, |
|
"grad_norm": 0.2684236466884613, |
|
"learning_rate": 3.192065343126658e-05, |
|
"loss": 1.2818, |
|
"num_input_tokens_seen": 162539520, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.21728145528044468, |
|
"grad_norm": 0.26715096831321716, |
|
"learning_rate": 3.187996638671134e-05, |
|
"loss": 1.2616, |
|
"num_input_tokens_seen": 163462688, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.21854471955533097, |
|
"grad_norm": 0.26400476694107056, |
|
"learning_rate": 3.1839038595205555e-05, |
|
"loss": 1.3017, |
|
"num_input_tokens_seen": 164408768, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.21980798383021727, |
|
"grad_norm": 0.2887386381626129, |
|
"learning_rate": 3.1797870741955326e-05, |
|
"loss": 1.2897, |
|
"num_input_tokens_seen": 165382816, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.2210712481051036, |
|
"grad_norm": 0.26668059825897217, |
|
"learning_rate": 3.175646351618586e-05, |
|
"loss": 1.3151, |
|
"num_input_tokens_seen": 166320832, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.22233451237998988, |
|
"grad_norm": 0.2531121075153351, |
|
"learning_rate": 3.171481761112989e-05, |
|
"loss": 1.3027, |
|
"num_input_tokens_seen": 167349856, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.2235977766548762, |
|
"grad_norm": 0.24423161149024963, |
|
"learning_rate": 3.167293372401606e-05, |
|
"loss": 1.3245, |
|
"num_input_tokens_seen": 168295712, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.2248610409297625, |
|
"grad_norm": 0.31519579887390137, |
|
"learning_rate": 3.163081255605729e-05, |
|
"loss": 1.2645, |
|
"num_input_tokens_seen": 169282112, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.22612430520464882, |
|
"grad_norm": 0.26210370659828186, |
|
"learning_rate": 3.1588454812439e-05, |
|
"loss": 1.3267, |
|
"num_input_tokens_seen": 170222336, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.22738756947953512, |
|
"grad_norm": 0.27912288904190063, |
|
"learning_rate": 3.154586120230734e-05, |
|
"loss": 1.277, |
|
"num_input_tokens_seen": 171119488, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2286508337544214, |
|
"grad_norm": 0.26281440258026123, |
|
"learning_rate": 3.150303243875727e-05, |
|
"loss": 1.2892, |
|
"num_input_tokens_seen": 172093984, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.22991409802930773, |
|
"grad_norm": 0.2663213908672333, |
|
"learning_rate": 3.1459969238820664e-05, |
|
"loss": 1.3388, |
|
"num_input_tokens_seen": 172993696, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.23117736230419403, |
|
"grad_norm": 0.27080100774765015, |
|
"learning_rate": 3.141667232345429e-05, |
|
"loss": 1.3374, |
|
"num_input_tokens_seen": 173906304, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.23244062657908035, |
|
"grad_norm": 0.2679150104522705, |
|
"learning_rate": 3.137314241752775e-05, |
|
"loss": 1.288, |
|
"num_input_tokens_seen": 174847680, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.23370389085396664, |
|
"grad_norm": 0.2680162489414215, |
|
"learning_rate": 3.1329380249811304e-05, |
|
"loss": 1.3088, |
|
"num_input_tokens_seen": 175814240, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.23496715512885297, |
|
"grad_norm": 0.27686336636543274, |
|
"learning_rate": 3.128538655296373e-05, |
|
"loss": 1.2868, |
|
"num_input_tokens_seen": 176805408, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.23623041940373926, |
|
"grad_norm": 0.2732996344566345, |
|
"learning_rate": 3.1241162063520015e-05, |
|
"loss": 1.3692, |
|
"num_input_tokens_seen": 177763168, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.23749368367862556, |
|
"grad_norm": 0.25114187598228455, |
|
"learning_rate": 3.1196707521879027e-05, |
|
"loss": 1.3054, |
|
"num_input_tokens_seen": 178689312, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.23875694795351188, |
|
"grad_norm": 0.29648059606552124, |
|
"learning_rate": 3.115202367229115e-05, |
|
"loss": 1.3289, |
|
"num_input_tokens_seen": 179578144, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.24002021222839817, |
|
"grad_norm": 0.25034409761428833, |
|
"learning_rate": 3.110711126284578e-05, |
|
"loss": 1.305, |
|
"num_input_tokens_seen": 180480192, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2412834765032845, |
|
"grad_norm": 0.26325249671936035, |
|
"learning_rate": 3.106197104545884e-05, |
|
"loss": 1.2645, |
|
"num_input_tokens_seen": 181482336, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.2425467407781708, |
|
"grad_norm": 0.279535710811615, |
|
"learning_rate": 3.101660377586017e-05, |
|
"loss": 1.2723, |
|
"num_input_tokens_seen": 182353792, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.2438100050530571, |
|
"grad_norm": 0.27417901158332825, |
|
"learning_rate": 3.097101021358088e-05, |
|
"loss": 1.2933, |
|
"num_input_tokens_seen": 183284000, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.2450732693279434, |
|
"grad_norm": 0.2854447066783905, |
|
"learning_rate": 3.092519112194063e-05, |
|
"loss": 1.2642, |
|
"num_input_tokens_seen": 184244640, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.2463365336028297, |
|
"grad_norm": 0.2935086190700531, |
|
"learning_rate": 3.087914726803486e-05, |
|
"loss": 1.3183, |
|
"num_input_tokens_seen": 185157728, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.24759979787771602, |
|
"grad_norm": 0.255464643239975, |
|
"learning_rate": 3.0832879422721926e-05, |
|
"loss": 1.2957, |
|
"num_input_tokens_seen": 186099200, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.24886306215260232, |
|
"grad_norm": 0.2608180642127991, |
|
"learning_rate": 3.078638836061023e-05, |
|
"loss": 1.3333, |
|
"num_input_tokens_seen": 187017280, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.2501263264274886, |
|
"grad_norm": 0.3294975459575653, |
|
"learning_rate": 3.073967486004523e-05, |
|
"loss": 1.332, |
|
"num_input_tokens_seen": 187879360, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.25138959070237493, |
|
"grad_norm": 0.2539006769657135, |
|
"learning_rate": 3.069273970309639e-05, |
|
"loss": 1.2726, |
|
"num_input_tokens_seen": 188825632, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.25265285497726125, |
|
"grad_norm": 0.282306969165802, |
|
"learning_rate": 3.064558367554414e-05, |
|
"loss": 1.32, |
|
"num_input_tokens_seen": 189801824, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.25265285497726125, |
|
"eval_loss": 1.321367859840393, |
|
"eval_runtime": 11.9892, |
|
"eval_samples_per_second": 12.511, |
|
"eval_steps_per_second": 0.834, |
|
"num_input_tokens_seen": 189801824, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2539161192521476, |
|
"grad_norm": 0.30715829133987427, |
|
"learning_rate": 3.0598207566866656e-05, |
|
"loss": 1.2423, |
|
"num_input_tokens_seen": 190754304, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.25517938352703384, |
|
"grad_norm": 0.2773028016090393, |
|
"learning_rate": 3.055061217022669e-05, |
|
"loss": 1.2411, |
|
"num_input_tokens_seen": 191695456, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.25644264780192016, |
|
"grad_norm": 0.267785906791687, |
|
"learning_rate": 3.0502798282458278e-05, |
|
"loss": 1.2461, |
|
"num_input_tokens_seen": 192625312, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.2577059120768065, |
|
"grad_norm": 0.2458842545747757, |
|
"learning_rate": 3.0454766704053395e-05, |
|
"loss": 1.2419, |
|
"num_input_tokens_seen": 193574848, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.25896917635169275, |
|
"grad_norm": 0.27695903182029724, |
|
"learning_rate": 3.040651823914855e-05, |
|
"loss": 1.3366, |
|
"num_input_tokens_seen": 194470688, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.2602324406265791, |
|
"grad_norm": 0.3028598725795746, |
|
"learning_rate": 3.0358053695511335e-05, |
|
"loss": 1.3199, |
|
"num_input_tokens_seen": 195437280, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.2614957049014654, |
|
"grad_norm": 0.2882876396179199, |
|
"learning_rate": 3.030937388452689e-05, |
|
"loss": 1.3221, |
|
"num_input_tokens_seen": 196396320, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.2627589691763517, |
|
"grad_norm": 0.29042840003967285, |
|
"learning_rate": 3.026047962118433e-05, |
|
"loss": 1.2693, |
|
"num_input_tokens_seen": 197314176, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.264022233451238, |
|
"grad_norm": 0.3192022740840912, |
|
"learning_rate": 3.0211371724063097e-05, |
|
"loss": 1.2668, |
|
"num_input_tokens_seen": 198295456, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.2652854977261243, |
|
"grad_norm": 0.250468373298645, |
|
"learning_rate": 3.016205101531925e-05, |
|
"loss": 1.2951, |
|
"num_input_tokens_seen": 199239264, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.26654876200101063, |
|
"grad_norm": 0.2620362639427185, |
|
"learning_rate": 3.0112518320671694e-05, |
|
"loss": 1.2826, |
|
"num_input_tokens_seen": 200166720, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.2678120262758969, |
|
"grad_norm": 0.2919938862323761, |
|
"learning_rate": 3.0062774469388378e-05, |
|
"loss": 1.3001, |
|
"num_input_tokens_seen": 201163456, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.2690752905507832, |
|
"grad_norm": 0.26850852370262146, |
|
"learning_rate": 3.0012820294272402e-05, |
|
"loss": 1.3118, |
|
"num_input_tokens_seen": 202055360, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.27033855482566954, |
|
"grad_norm": 0.2463986724615097, |
|
"learning_rate": 2.9962656631648068e-05, |
|
"loss": 1.2797, |
|
"num_input_tokens_seen": 202973376, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.27160181910055586, |
|
"grad_norm": 0.3001090884208679, |
|
"learning_rate": 2.991228432134687e-05, |
|
"loss": 1.2917, |
|
"num_input_tokens_seen": 203918208, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.27286508337544213, |
|
"grad_norm": 0.2551255524158478, |
|
"learning_rate": 2.9861704206693464e-05, |
|
"loss": 1.299, |
|
"num_input_tokens_seen": 204934080, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.27412834765032845, |
|
"grad_norm": 0.26097556948661804, |
|
"learning_rate": 2.9810917134491515e-05, |
|
"loss": 1.2935, |
|
"num_input_tokens_seen": 205865376, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.2753916119252148, |
|
"grad_norm": 0.2827478051185608, |
|
"learning_rate": 2.975992395500956e-05, |
|
"loss": 1.3006, |
|
"num_input_tokens_seen": 206770144, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.27665487620010104, |
|
"grad_norm": 0.28954237699508667, |
|
"learning_rate": 2.9708725521966717e-05, |
|
"loss": 1.3424, |
|
"num_input_tokens_seen": 207706784, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.27791814047498736, |
|
"grad_norm": 0.2639777660369873, |
|
"learning_rate": 2.9657322692518452e-05, |
|
"loss": 1.231, |
|
"num_input_tokens_seen": 208641184, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2791814047498737, |
|
"grad_norm": 0.24287603795528412, |
|
"learning_rate": 2.9605716327242188e-05, |
|
"loss": 1.297, |
|
"num_input_tokens_seen": 209596512, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.28044466902476, |
|
"grad_norm": 0.2651768624782562, |
|
"learning_rate": 2.9553907290122907e-05, |
|
"loss": 1.3049, |
|
"num_input_tokens_seen": 210586464, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.2817079332996463, |
|
"grad_norm": 0.2656504809856415, |
|
"learning_rate": 2.9501896448538696e-05, |
|
"loss": 1.3497, |
|
"num_input_tokens_seen": 211556992, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.2829711975745326, |
|
"grad_norm": 0.26418015360832214, |
|
"learning_rate": 2.9449684673246218e-05, |
|
"loss": 1.2702, |
|
"num_input_tokens_seen": 212522560, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.2842344618494189, |
|
"grad_norm": 0.2586632966995239, |
|
"learning_rate": 2.9397272838366127e-05, |
|
"loss": 1.3232, |
|
"num_input_tokens_seen": 213488448, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.2854977261243052, |
|
"grad_norm": 0.28703370690345764, |
|
"learning_rate": 2.934466182136845e-05, |
|
"loss": 1.3158, |
|
"num_input_tokens_seen": 214453408, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.2867609903991915, |
|
"grad_norm": 0.2626774311065674, |
|
"learning_rate": 2.9291852503057874e-05, |
|
"loss": 1.3394, |
|
"num_input_tokens_seen": 215412832, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.28802425467407783, |
|
"grad_norm": 0.256173312664032, |
|
"learning_rate": 2.923884576755903e-05, |
|
"loss": 1.3325, |
|
"num_input_tokens_seen": 216335968, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.28928751894896415, |
|
"grad_norm": 0.26622363924980164, |
|
"learning_rate": 2.9185642502301656e-05, |
|
"loss": 1.2535, |
|
"num_input_tokens_seen": 217269728, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.2905507832238504, |
|
"grad_norm": 0.3084118068218231, |
|
"learning_rate": 2.9132243598005775e-05, |
|
"loss": 1.2808, |
|
"num_input_tokens_seen": 218189440, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.29181404749873674, |
|
"grad_norm": 0.32699644565582275, |
|
"learning_rate": 2.9078649948666754e-05, |
|
"loss": 1.3637, |
|
"num_input_tokens_seen": 219151008, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.29307731177362306, |
|
"grad_norm": 0.2988159954547882, |
|
"learning_rate": 2.902486245154035e-05, |
|
"loss": 1.2898, |
|
"num_input_tokens_seen": 220065312, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.29434057604850933, |
|
"grad_norm": 0.27708715200424194, |
|
"learning_rate": 2.897088200712769e-05, |
|
"loss": 1.2583, |
|
"num_input_tokens_seen": 220958560, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.29560384032339565, |
|
"grad_norm": 0.2532431185245514, |
|
"learning_rate": 2.8916709519160187e-05, |
|
"loss": 1.2647, |
|
"num_input_tokens_seen": 221960800, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.296867104598282, |
|
"grad_norm": 0.2507975101470947, |
|
"learning_rate": 2.8862345894584418e-05, |
|
"loss": 1.2569, |
|
"num_input_tokens_seen": 222927616, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.2981303688731683, |
|
"grad_norm": 0.30082589387893677, |
|
"learning_rate": 2.880779204354694e-05, |
|
"loss": 1.2582, |
|
"num_input_tokens_seen": 223897536, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.29939363314805456, |
|
"grad_norm": 0.25084131956100464, |
|
"learning_rate": 2.875304887937904e-05, |
|
"loss": 1.2445, |
|
"num_input_tokens_seen": 224856256, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.3006568974229409, |
|
"grad_norm": 0.27553117275238037, |
|
"learning_rate": 2.869811731858146e-05, |
|
"loss": 1.2693, |
|
"num_input_tokens_seen": 225789760, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.3019201616978272, |
|
"grad_norm": 0.31296080350875854, |
|
"learning_rate": 2.864299828080905e-05, |
|
"loss": 1.3125, |
|
"num_input_tokens_seen": 226730144, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.30318342597271347, |
|
"grad_norm": 0.2597751021385193, |
|
"learning_rate": 2.858769268885535e-05, |
|
"loss": 1.2959, |
|
"num_input_tokens_seen": 227688608, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.3044466902475998, |
|
"grad_norm": 0.27299267053604126, |
|
"learning_rate": 2.8532201468637184e-05, |
|
"loss": 1.2932, |
|
"num_input_tokens_seen": 228590528, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.3057099545224861, |
|
"grad_norm": 0.2804098129272461, |
|
"learning_rate": 2.8476525549179103e-05, |
|
"loss": 1.3001, |
|
"num_input_tokens_seen": 229560000, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.30697321879737244, |
|
"grad_norm": 0.30946534872055054, |
|
"learning_rate": 2.8420665862597894e-05, |
|
"loss": 1.2657, |
|
"num_input_tokens_seen": 230542208, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.3082364830722587, |
|
"grad_norm": 0.2868455648422241, |
|
"learning_rate": 2.8364623344086917e-05, |
|
"loss": 1.3603, |
|
"num_input_tokens_seen": 231454912, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.309499747347145, |
|
"grad_norm": 0.27222952246665955, |
|
"learning_rate": 2.8308398931900488e-05, |
|
"loss": 1.2796, |
|
"num_input_tokens_seen": 232387808, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.31076301162203135, |
|
"grad_norm": 0.29506227374076843, |
|
"learning_rate": 2.825199356733814e-05, |
|
"loss": 1.2863, |
|
"num_input_tokens_seen": 233295584, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.3120262758969176, |
|
"grad_norm": 0.25060921907424927, |
|
"learning_rate": 2.8195408194728893e-05, |
|
"loss": 1.2725, |
|
"num_input_tokens_seen": 234308960, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.31328954017180394, |
|
"grad_norm": 0.29915860295295715, |
|
"learning_rate": 2.8138643761415432e-05, |
|
"loss": 1.2656, |
|
"num_input_tokens_seen": 235218880, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.31455280444669026, |
|
"grad_norm": 0.30492904782295227, |
|
"learning_rate": 2.8081701217738234e-05, |
|
"loss": 1.2962, |
|
"num_input_tokens_seen": 236173888, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.3158160687215765, |
|
"grad_norm": 0.2989721894264221, |
|
"learning_rate": 2.8024581517019686e-05, |
|
"loss": 1.272, |
|
"num_input_tokens_seen": 237219584, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.31707933299646285, |
|
"grad_norm": 0.2604142725467682, |
|
"learning_rate": 2.7967285615548084e-05, |
|
"loss": 1.2846, |
|
"num_input_tokens_seen": 238150432, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.31834259727134917, |
|
"grad_norm": 0.2856138050556183, |
|
"learning_rate": 2.790981447256168e-05, |
|
"loss": 1.309, |
|
"num_input_tokens_seen": 239091040, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.3196058615462355, |
|
"grad_norm": 0.26201140880584717, |
|
"learning_rate": 2.785216905023256e-05, |
|
"loss": 1.3273, |
|
"num_input_tokens_seen": 240005152, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.32086912582112176, |
|
"grad_norm": 0.2805967628955841, |
|
"learning_rate": 2.7794350313650574e-05, |
|
"loss": 1.3044, |
|
"num_input_tokens_seen": 240957856, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.3221323900960081, |
|
"grad_norm": 0.25588178634643555, |
|
"learning_rate": 2.7736359230807183e-05, |
|
"loss": 1.4082, |
|
"num_input_tokens_seen": 241939904, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.3233956543708944, |
|
"grad_norm": 0.25974375009536743, |
|
"learning_rate": 2.767819677257922e-05, |
|
"loss": 1.3256, |
|
"num_input_tokens_seen": 242886176, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.32465891864578067, |
|
"grad_norm": 0.2552843689918518, |
|
"learning_rate": 2.761986391271267e-05, |
|
"loss": 1.3003, |
|
"num_input_tokens_seen": 243769600, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.325922182920667, |
|
"grad_norm": 0.2774961590766907, |
|
"learning_rate": 2.7561361627806343e-05, |
|
"loss": 1.3239, |
|
"num_input_tokens_seen": 244675136, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.3271854471955533, |
|
"grad_norm": 0.27106648683547974, |
|
"learning_rate": 2.7502690897295546e-05, |
|
"loss": 1.3087, |
|
"num_input_tokens_seen": 245566400, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.32844871147043964, |
|
"grad_norm": 0.253461629152298, |
|
"learning_rate": 2.7443852703435657e-05, |
|
"loss": 1.2503, |
|
"num_input_tokens_seen": 246513216, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.3297119757453259, |
|
"grad_norm": 0.290099173784256, |
|
"learning_rate": 2.738484803128571e-05, |
|
"loss": 1.3034, |
|
"num_input_tokens_seen": 247488992, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.3309752400202122, |
|
"grad_norm": 0.2331458479166031, |
|
"learning_rate": 2.7325677868691897e-05, |
|
"loss": 1.2443, |
|
"num_input_tokens_seen": 248404800, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.33223850429509855, |
|
"grad_norm": 0.2953519821166992, |
|
"learning_rate": 2.7266343206271e-05, |
|
"loss": 1.2703, |
|
"num_input_tokens_seen": 249396800, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.3335017685699848, |
|
"grad_norm": 0.2447034865617752, |
|
"learning_rate": 2.7206845037393847e-05, |
|
"loss": 1.2079, |
|
"num_input_tokens_seen": 250344864, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.33476503284487114, |
|
"grad_norm": 0.2688887417316437, |
|
"learning_rate": 2.7147184358168654e-05, |
|
"loss": 1.2866, |
|
"num_input_tokens_seen": 251205088, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.33602829711975746, |
|
"grad_norm": 0.284983366727829, |
|
"learning_rate": 2.7087362167424363e-05, |
|
"loss": 1.2328, |
|
"num_input_tokens_seen": 252125664, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.3372915613946438, |
|
"grad_norm": 0.26568886637687683, |
|
"learning_rate": 2.7027379466693918e-05, |
|
"loss": 1.3343, |
|
"num_input_tokens_seen": 253090112, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.33855482566953005, |
|
"grad_norm": 0.2735290229320526, |
|
"learning_rate": 2.6967237260197486e-05, |
|
"loss": 1.3117, |
|
"num_input_tokens_seen": 254002816, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.33981808994441637, |
|
"grad_norm": 0.2602190673351288, |
|
"learning_rate": 2.6906936554825652e-05, |
|
"loss": 1.2729, |
|
"num_input_tokens_seen": 254977856, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.3410813542193027, |
|
"grad_norm": 0.279680997133255, |
|
"learning_rate": 2.6846478360122567e-05, |
|
"loss": 1.2494, |
|
"num_input_tokens_seen": 255872864, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.34234461849418896, |
|
"grad_norm": 0.29687556624412537, |
|
"learning_rate": 2.6785863688269038e-05, |
|
"loss": 1.3039, |
|
"num_input_tokens_seen": 256788352, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.3436078827690753, |
|
"grad_norm": 0.24734219908714294, |
|
"learning_rate": 2.6725093554065596e-05, |
|
"loss": 1.2728, |
|
"num_input_tokens_seen": 257691904, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.3448711470439616, |
|
"grad_norm": 0.2798856496810913, |
|
"learning_rate": 2.666416897491548e-05, |
|
"loss": 1.2519, |
|
"num_input_tokens_seen": 258613408, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.3461344113188479, |
|
"grad_norm": 0.3039948046207428, |
|
"learning_rate": 2.660309097080763e-05, |
|
"loss": 1.354, |
|
"num_input_tokens_seen": 259569248, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.3473976755937342, |
|
"grad_norm": 0.25825923681259155, |
|
"learning_rate": 2.6541860564299605e-05, |
|
"loss": 1.265, |
|
"num_input_tokens_seen": 260534624, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.3486609398686205, |
|
"grad_norm": 0.2977043390274048, |
|
"learning_rate": 2.6480478780500435e-05, |
|
"loss": 1.3044, |
|
"num_input_tokens_seen": 261467520, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.34992420414350683, |
|
"grad_norm": 0.2831237018108368, |
|
"learning_rate": 2.6418946647053525e-05, |
|
"loss": 1.2419, |
|
"num_input_tokens_seen": 262404128, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.3511874684183931, |
|
"grad_norm": 0.27858638763427734, |
|
"learning_rate": 2.635726519411936e-05, |
|
"loss": 1.2902, |
|
"num_input_tokens_seen": 263348320, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.3524507326932794, |
|
"grad_norm": 0.2645137310028076, |
|
"learning_rate": 2.629543545435835e-05, |
|
"loss": 1.2151, |
|
"num_input_tokens_seen": 264335616, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.35371399696816574, |
|
"grad_norm": 0.2533610165119171, |
|
"learning_rate": 2.623345846291347e-05, |
|
"loss": 1.2592, |
|
"num_input_tokens_seen": 265353120, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.35497726124305207, |
|
"grad_norm": 0.25733280181884766, |
|
"learning_rate": 2.6171335257392957e-05, |
|
"loss": 1.3101, |
|
"num_input_tokens_seen": 266300480, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.35624052551793833, |
|
"grad_norm": 0.2579527199268341, |
|
"learning_rate": 2.610906687785296e-05, |
|
"loss": 1.3144, |
|
"num_input_tokens_seen": 267223328, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.35750378979282466, |
|
"grad_norm": 0.2560044527053833, |
|
"learning_rate": 2.6046654366780096e-05, |
|
"loss": 1.2442, |
|
"num_input_tokens_seen": 268154112, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.358767054067711, |
|
"grad_norm": 0.24506497383117676, |
|
"learning_rate": 2.5984098769073995e-05, |
|
"loss": 1.3063, |
|
"num_input_tokens_seen": 269044736, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.36003031834259724, |
|
"grad_norm": 0.27899622917175293, |
|
"learning_rate": 2.592140113202984e-05, |
|
"loss": 1.2877, |
|
"num_input_tokens_seen": 270024064, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.36129358261748357, |
|
"grad_norm": 0.2520020604133606, |
|
"learning_rate": 2.5858562505320787e-05, |
|
"loss": 1.2984, |
|
"num_input_tokens_seen": 270993600, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.3625568468923699, |
|
"grad_norm": 0.24186141788959503, |
|
"learning_rate": 2.5795583940980456e-05, |
|
"loss": 1.2663, |
|
"num_input_tokens_seen": 271930176, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.3638201111672562, |
|
"grad_norm": 0.28816744685173035, |
|
"learning_rate": 2.5732466493385238e-05, |
|
"loss": 1.281, |
|
"num_input_tokens_seen": 272857216, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.3650833754421425, |
|
"grad_norm": 0.29359421133995056, |
|
"learning_rate": 2.566921121923671e-05, |
|
"loss": 1.2804, |
|
"num_input_tokens_seen": 273869376, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.3663466397170288, |
|
"grad_norm": 0.2661145329475403, |
|
"learning_rate": 2.5605819177543906e-05, |
|
"loss": 1.3292, |
|
"num_input_tokens_seen": 274802592, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.3676099039919151, |
|
"grad_norm": 0.26722949743270874, |
|
"learning_rate": 2.55422914296056e-05, |
|
"loss": 1.3162, |
|
"num_input_tokens_seen": 275777312, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.3688731682668014, |
|
"grad_norm": 0.2770121991634369, |
|
"learning_rate": 2.5478629038992545e-05, |
|
"loss": 1.2678, |
|
"num_input_tokens_seen": 276772352, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.3701364325416877, |
|
"grad_norm": 0.24549973011016846, |
|
"learning_rate": 2.5414833071529645e-05, |
|
"loss": 1.2787, |
|
"num_input_tokens_seen": 277728896, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.37139969681657403, |
|
"grad_norm": 0.25942620635032654, |
|
"learning_rate": 2.5350904595278142e-05, |
|
"loss": 1.2834, |
|
"num_input_tokens_seen": 278658272, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.37266296109146035, |
|
"grad_norm": 0.25496846437454224, |
|
"learning_rate": 2.52868446805177e-05, |
|
"loss": 1.2753, |
|
"num_input_tokens_seen": 279635456, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.3739262253663466, |
|
"grad_norm": 0.26107245683670044, |
|
"learning_rate": 2.5222654399728518e-05, |
|
"loss": 1.2995, |
|
"num_input_tokens_seen": 280610176, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.37518948964123294, |
|
"grad_norm": 0.29526421427726746, |
|
"learning_rate": 2.515833482757335e-05, |
|
"loss": 1.2749, |
|
"num_input_tokens_seen": 281500224, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.37645275391611926, |
|
"grad_norm": 0.2750958204269409, |
|
"learning_rate": 2.5093887040879536e-05, |
|
"loss": 1.2654, |
|
"num_input_tokens_seen": 282466240, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.37771601819100553, |
|
"grad_norm": 0.26100271940231323, |
|
"learning_rate": 2.502931211862095e-05, |
|
"loss": 1.2777, |
|
"num_input_tokens_seen": 283435136, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.37897928246589185, |
|
"grad_norm": 0.29179760813713074, |
|
"learning_rate": 2.4964611141899948e-05, |
|
"loss": 1.258, |
|
"num_input_tokens_seen": 284388960, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3802425467407782, |
|
"grad_norm": 0.2875267565250397, |
|
"learning_rate": 2.489978519392929e-05, |
|
"loss": 1.277, |
|
"num_input_tokens_seen": 285277344, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.3815058110156645, |
|
"grad_norm": 0.28722459077835083, |
|
"learning_rate": 2.4834835360013953e-05, |
|
"loss": 1.2274, |
|
"num_input_tokens_seen": 286206112, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.38276907529055076, |
|
"grad_norm": 0.2907884418964386, |
|
"learning_rate": 2.476976272753301e-05, |
|
"loss": 1.26, |
|
"num_input_tokens_seen": 287188160, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.3840323395654371, |
|
"grad_norm": 0.2554284334182739, |
|
"learning_rate": 2.4704568385921404e-05, |
|
"loss": 1.2949, |
|
"num_input_tokens_seen": 288111200, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.3852956038403234, |
|
"grad_norm": 0.24661648273468018, |
|
"learning_rate": 2.4639253426651703e-05, |
|
"loss": 1.2442, |
|
"num_input_tokens_seen": 289071840, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.3865588681152097, |
|
"grad_norm": 0.2564159035682678, |
|
"learning_rate": 2.457381894321585e-05, |
|
"loss": 1.2549, |
|
"num_input_tokens_seen": 290037344, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.387822132390096, |
|
"grad_norm": 0.24792881309986115, |
|
"learning_rate": 2.4508266031106835e-05, |
|
"loss": 1.2534, |
|
"num_input_tokens_seen": 290963680, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.3890853966649823, |
|
"grad_norm": 0.29164549708366394, |
|
"learning_rate": 2.4442595787800345e-05, |
|
"loss": 1.2799, |
|
"num_input_tokens_seen": 291992224, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.39034866093986864, |
|
"grad_norm": 0.24966460466384888, |
|
"learning_rate": 2.4376809312736438e-05, |
|
"loss": 1.2712, |
|
"num_input_tokens_seen": 292976480, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.3916119252147549, |
|
"grad_norm": 0.28835946321487427, |
|
"learning_rate": 2.431090770730107e-05, |
|
"loss": 1.3135, |
|
"num_input_tokens_seen": 293943776, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.39287518948964123, |
|
"grad_norm": 0.25582680106163025, |
|
"learning_rate": 2.4244892074807714e-05, |
|
"loss": 1.1963, |
|
"num_input_tokens_seen": 294860864, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.39413845376452755, |
|
"grad_norm": 0.24214211106300354, |
|
"learning_rate": 2.4178763520478864e-05, |
|
"loss": 1.225, |
|
"num_input_tokens_seen": 295732256, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.3954017180394138, |
|
"grad_norm": 0.30721724033355713, |
|
"learning_rate": 2.4112523151427515e-05, |
|
"loss": 1.2633, |
|
"num_input_tokens_seen": 296664736, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.39666498231430014, |
|
"grad_norm": 0.30337947607040405, |
|
"learning_rate": 2.4046172076638657e-05, |
|
"loss": 1.2676, |
|
"num_input_tokens_seen": 297635488, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.39792824658918646, |
|
"grad_norm": 0.28588712215423584, |
|
"learning_rate": 2.3979711406950688e-05, |
|
"loss": 1.2635, |
|
"num_input_tokens_seen": 298546208, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.3991915108640728, |
|
"grad_norm": 0.27065521478652954, |
|
"learning_rate": 2.3913142255036848e-05, |
|
"loss": 1.3024, |
|
"num_input_tokens_seen": 299442720, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.40045477513895905, |
|
"grad_norm": 0.2623492181301117, |
|
"learning_rate": 2.384646573538654e-05, |
|
"loss": 1.2968, |
|
"num_input_tokens_seen": 300421664, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.4017180394138454, |
|
"grad_norm": 0.27391478419303894, |
|
"learning_rate": 2.3779682964286715e-05, |
|
"loss": 1.2181, |
|
"num_input_tokens_seen": 301369824, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.4029813036887317, |
|
"grad_norm": 0.2633381187915802, |
|
"learning_rate": 2.3712795059803166e-05, |
|
"loss": 1.2459, |
|
"num_input_tokens_seen": 302411648, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.40424456796361796, |
|
"grad_norm": 0.2716757655143738, |
|
"learning_rate": 2.36458031417618e-05, |
|
"loss": 1.2883, |
|
"num_input_tokens_seen": 303342464, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.4055078322385043, |
|
"grad_norm": 0.26981112360954285, |
|
"learning_rate": 2.3578708331729927e-05, |
|
"loss": 1.2978, |
|
"num_input_tokens_seen": 304307424, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.4067710965133906, |
|
"grad_norm": 0.24773098528385162, |
|
"learning_rate": 2.3511511752997423e-05, |
|
"loss": 1.3291, |
|
"num_input_tokens_seen": 305311648, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.40803436078827693, |
|
"grad_norm": 0.2609155774116516, |
|
"learning_rate": 2.3444214530557985e-05, |
|
"loss": 1.2416, |
|
"num_input_tokens_seen": 306299200, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.4092976250631632, |
|
"grad_norm": 0.258277028799057, |
|
"learning_rate": 2.3376817791090263e-05, |
|
"loss": 1.2476, |
|
"num_input_tokens_seen": 307199776, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.4105608893380495, |
|
"grad_norm": 0.3055669963359833, |
|
"learning_rate": 2.3309322662938994e-05, |
|
"loss": 1.2846, |
|
"num_input_tokens_seen": 308118080, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.41182415361293584, |
|
"grad_norm": 0.28719931840896606, |
|
"learning_rate": 2.3241730276096136e-05, |
|
"loss": 1.2432, |
|
"num_input_tokens_seen": 309095584, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.4130874178878221, |
|
"grad_norm": 0.2620775103569031, |
|
"learning_rate": 2.3174041762181924e-05, |
|
"loss": 1.3018, |
|
"num_input_tokens_seen": 310052032, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.41435068216270843, |
|
"grad_norm": 0.2525536119937897, |
|
"learning_rate": 2.310625825442595e-05, |
|
"loss": 1.2721, |
|
"num_input_tokens_seen": 311011040, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.41561394643759475, |
|
"grad_norm": 0.24205638468265533, |
|
"learning_rate": 2.3038380887648158e-05, |
|
"loss": 1.283, |
|
"num_input_tokens_seen": 311953920, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.41687721071248107, |
|
"grad_norm": 0.2821497321128845, |
|
"learning_rate": 2.2970410798239875e-05, |
|
"loss": 1.2184, |
|
"num_input_tokens_seen": 312900064, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.41814047498736734, |
|
"grad_norm": 0.26797381043434143, |
|
"learning_rate": 2.290234912414478e-05, |
|
"loss": 1.2682, |
|
"num_input_tokens_seen": 313856160, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.41940373926225366, |
|
"grad_norm": 0.26029297709465027, |
|
"learning_rate": 2.2834197004839832e-05, |
|
"loss": 1.2241, |
|
"num_input_tokens_seen": 314758112, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.42066700353714, |
|
"grad_norm": 0.2785716950893402, |
|
"learning_rate": 2.276595558131622e-05, |
|
"loss": 1.1807, |
|
"num_input_tokens_seen": 315687232, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.42193026781202625, |
|
"grad_norm": 0.282991886138916, |
|
"learning_rate": 2.2697625996060242e-05, |
|
"loss": 1.2337, |
|
"num_input_tokens_seen": 316675552, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.42319353208691257, |
|
"grad_norm": 0.26791542768478394, |
|
"learning_rate": 2.2629209393034202e-05, |
|
"loss": 1.277, |
|
"num_input_tokens_seen": 317594112, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.4244567963617989, |
|
"grad_norm": 0.2645999789237976, |
|
"learning_rate": 2.256070691765721e-05, |
|
"loss": 1.2995, |
|
"num_input_tokens_seen": 318542656, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.4257200606366852, |
|
"grad_norm": 0.2621070146560669, |
|
"learning_rate": 2.249211971678606e-05, |
|
"loss": 1.2712, |
|
"num_input_tokens_seen": 319529632, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.4269833249115715, |
|
"grad_norm": 0.292126327753067, |
|
"learning_rate": 2.2423448938696008e-05, |
|
"loss": 1.281, |
|
"num_input_tokens_seen": 320495008, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.4282465891864578, |
|
"grad_norm": 0.26194462180137634, |
|
"learning_rate": 2.235469573306152e-05, |
|
"loss": 1.2705, |
|
"num_input_tokens_seen": 321386944, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.4295098534613441, |
|
"grad_norm": 0.26072680950164795, |
|
"learning_rate": 2.2285861250937078e-05, |
|
"loss": 1.3382, |
|
"num_input_tokens_seen": 322285280, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.4307731177362304, |
|
"grad_norm": 0.308788001537323, |
|
"learning_rate": 2.2216946644737867e-05, |
|
"loss": 1.3189, |
|
"num_input_tokens_seen": 323297568, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.4320363820111167, |
|
"grad_norm": 0.26922985911369324, |
|
"learning_rate": 2.2147953068220498e-05, |
|
"loss": 1.2132, |
|
"num_input_tokens_seen": 324283360, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.43329964628600304, |
|
"grad_norm": 0.27006080746650696, |
|
"learning_rate": 2.207888167646369e-05, |
|
"loss": 1.2268, |
|
"num_input_tokens_seen": 325189760, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.43456291056088936, |
|
"grad_norm": 0.26316067576408386, |
|
"learning_rate": 2.2009733625848932e-05, |
|
"loss": 1.2945, |
|
"num_input_tokens_seen": 326144000, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.4358261748357756, |
|
"grad_norm": 0.2620113790035248, |
|
"learning_rate": 2.1940510074041124e-05, |
|
"loss": 1.2857, |
|
"num_input_tokens_seen": 327078432, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.43708943911066195, |
|
"grad_norm": 0.3018427789211273, |
|
"learning_rate": 2.1871212179969193e-05, |
|
"loss": 1.2732, |
|
"num_input_tokens_seen": 327975328, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.43835270338554827, |
|
"grad_norm": 0.3014253079891205, |
|
"learning_rate": 2.180184110380668e-05, |
|
"loss": 1.2944, |
|
"num_input_tokens_seen": 328923296, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.43961596766043454, |
|
"grad_norm": 0.26709380745887756, |
|
"learning_rate": 2.173239800695235e-05, |
|
"loss": 1.2801, |
|
"num_input_tokens_seen": 329852576, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.44087923193532086, |
|
"grad_norm": 0.26904571056365967, |
|
"learning_rate": 2.1662884052010715e-05, |
|
"loss": 1.3081, |
|
"num_input_tokens_seen": 330887712, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.4421424962102072, |
|
"grad_norm": 0.2532831132411957, |
|
"learning_rate": 2.1593300402772578e-05, |
|
"loss": 1.2399, |
|
"num_input_tokens_seen": 331852448, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4434057604850935, |
|
"grad_norm": 0.2727656364440918, |
|
"learning_rate": 2.1523648224195553e-05, |
|
"loss": 1.3334, |
|
"num_input_tokens_seen": 332849824, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.44466902475997977, |
|
"grad_norm": 0.2567518353462219, |
|
"learning_rate": 2.1453928682384567e-05, |
|
"loss": 1.2469, |
|
"num_input_tokens_seen": 333796544, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.4459322890348661, |
|
"grad_norm": 0.27944666147232056, |
|
"learning_rate": 2.1384142944572327e-05, |
|
"loss": 1.2182, |
|
"num_input_tokens_seen": 334769728, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.4471955533097524, |
|
"grad_norm": 0.26202327013015747, |
|
"learning_rate": 2.131429217909978e-05, |
|
"loss": 1.2556, |
|
"num_input_tokens_seen": 335697824, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.4484588175846387, |
|
"grad_norm": 0.2528652250766754, |
|
"learning_rate": 2.1244377555396552e-05, |
|
"loss": 1.2889, |
|
"num_input_tokens_seen": 336718816, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.449722081859525, |
|
"grad_norm": 0.27603092789649963, |
|
"learning_rate": 2.1174400243961384e-05, |
|
"loss": 1.2786, |
|
"num_input_tokens_seen": 337621120, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.4509853461344113, |
|
"grad_norm": 0.2740069627761841, |
|
"learning_rate": 2.1104361416342515e-05, |
|
"loss": 1.2048, |
|
"num_input_tokens_seen": 338654368, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.45224861040929765, |
|
"grad_norm": 0.2614036798477173, |
|
"learning_rate": 2.1034262245118083e-05, |
|
"loss": 1.299, |
|
"num_input_tokens_seen": 339635072, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.4535118746841839, |
|
"grad_norm": 0.2862122058868408, |
|
"learning_rate": 2.0964103903876478e-05, |
|
"loss": 1.2675, |
|
"num_input_tokens_seen": 340587008, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.45477513895907024, |
|
"grad_norm": 0.2503550946712494, |
|
"learning_rate": 2.089388756719672e-05, |
|
"loss": 1.3265, |
|
"num_input_tokens_seen": 341507104, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.45603840323395656, |
|
"grad_norm": 0.2760883867740631, |
|
"learning_rate": 2.0823614410628762e-05, |
|
"loss": 1.2568, |
|
"num_input_tokens_seen": 342452832, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.4573016675088428, |
|
"grad_norm": 0.25591230392456055, |
|
"learning_rate": 2.075328561067385e-05, |
|
"loss": 1.2854, |
|
"num_input_tokens_seen": 343443968, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.45856493178372915, |
|
"grad_norm": 0.247548446059227, |
|
"learning_rate": 2.0682902344764768e-05, |
|
"loss": 1.2427, |
|
"num_input_tokens_seen": 344422112, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.45982819605861547, |
|
"grad_norm": 0.2951701879501343, |
|
"learning_rate": 2.0612465791246192e-05, |
|
"loss": 1.2824, |
|
"num_input_tokens_seen": 345312448, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.4610914603335018, |
|
"grad_norm": 0.2961169481277466, |
|
"learning_rate": 2.0541977129354912e-05, |
|
"loss": 1.266, |
|
"num_input_tokens_seen": 346277152, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.46235472460838806, |
|
"grad_norm": 0.27115508913993835, |
|
"learning_rate": 2.0471437539200107e-05, |
|
"loss": 1.3118, |
|
"num_input_tokens_seen": 347211840, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.4636179888832744, |
|
"grad_norm": 0.27469298243522644, |
|
"learning_rate": 2.0400848201743608e-05, |
|
"loss": 1.1801, |
|
"num_input_tokens_seen": 348124992, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.4648812531581607, |
|
"grad_norm": 0.26864269375801086, |
|
"learning_rate": 2.033021029878008e-05, |
|
"loss": 1.2319, |
|
"num_input_tokens_seen": 349074176, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.46614451743304697, |
|
"grad_norm": 0.2966035008430481, |
|
"learning_rate": 2.0259525012917273e-05, |
|
"loss": 1.3158, |
|
"num_input_tokens_seen": 350022112, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.4674077817079333, |
|
"grad_norm": 0.24909211695194244, |
|
"learning_rate": 2.0188793527556226e-05, |
|
"loss": 1.2902, |
|
"num_input_tokens_seen": 350974272, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.4686710459828196, |
|
"grad_norm": 0.256197065114975, |
|
"learning_rate": 2.011801702687142e-05, |
|
"loss": 1.2275, |
|
"num_input_tokens_seen": 351958848, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.46993431025770593, |
|
"grad_norm": 0.2664201259613037, |
|
"learning_rate": 2.0047196695791006e-05, |
|
"loss": 1.2488, |
|
"num_input_tokens_seen": 352921472, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.4711975745325922, |
|
"grad_norm": 0.2655077278614044, |
|
"learning_rate": 1.997633371997689e-05, |
|
"loss": 1.2214, |
|
"num_input_tokens_seen": 353841344, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.4724608388074785, |
|
"grad_norm": 0.2981346845626831, |
|
"learning_rate": 1.9905429285804987e-05, |
|
"loss": 1.2257, |
|
"num_input_tokens_seen": 354788480, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.47372410308236484, |
|
"grad_norm": 0.3032223880290985, |
|
"learning_rate": 1.9834484580345248e-05, |
|
"loss": 1.2228, |
|
"num_input_tokens_seen": 355683616, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.4749873673572511, |
|
"grad_norm": 0.2835098206996918, |
|
"learning_rate": 1.976350079134187e-05, |
|
"loss": 1.2498, |
|
"num_input_tokens_seen": 356653312, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.47625063163213743, |
|
"grad_norm": 0.2348804771900177, |
|
"learning_rate": 1.9692479107193365e-05, |
|
"loss": 1.2461, |
|
"num_input_tokens_seen": 357609024, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.47751389590702376, |
|
"grad_norm": 0.28105470538139343, |
|
"learning_rate": 1.962142071693269e-05, |
|
"loss": 1.2909, |
|
"num_input_tokens_seen": 358542368, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.4787771601819101, |
|
"grad_norm": 0.27118179202079773, |
|
"learning_rate": 1.9550326810207325e-05, |
|
"loss": 1.2809, |
|
"num_input_tokens_seen": 359444576, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.48004042445679634, |
|
"grad_norm": 0.2707975506782532, |
|
"learning_rate": 1.9479198577259356e-05, |
|
"loss": 1.2116, |
|
"num_input_tokens_seen": 360334912, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.48130368873168267, |
|
"grad_norm": 0.2806662619113922, |
|
"learning_rate": 1.9408037208905558e-05, |
|
"loss": 1.2828, |
|
"num_input_tokens_seen": 361304576, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.482566953006569, |
|
"grad_norm": 0.2591959834098816, |
|
"learning_rate": 1.9336843896517458e-05, |
|
"loss": 1.1958, |
|
"num_input_tokens_seen": 362211520, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.48383021728145525, |
|
"grad_norm": 0.2818770706653595, |
|
"learning_rate": 1.926561983200137e-05, |
|
"loss": 1.3481, |
|
"num_input_tokens_seen": 363114336, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.4850934815563416, |
|
"grad_norm": 0.25823378562927246, |
|
"learning_rate": 1.919436620777847e-05, |
|
"loss": 1.2547, |
|
"num_input_tokens_seen": 364014272, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.4863567458312279, |
|
"grad_norm": 0.254759818315506, |
|
"learning_rate": 1.9123084216764807e-05, |
|
"loss": 1.2323, |
|
"num_input_tokens_seen": 364978528, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.4876200101061142, |
|
"grad_norm": 0.26032665371894836, |
|
"learning_rate": 1.9051775052351343e-05, |
|
"loss": 1.3204, |
|
"num_input_tokens_seen": 365890720, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.4888832743810005, |
|
"grad_norm": 0.26584163308143616, |
|
"learning_rate": 1.8980439908383986e-05, |
|
"loss": 1.2814, |
|
"num_input_tokens_seen": 366818304, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.4901465386558868, |
|
"grad_norm": 0.2640645205974579, |
|
"learning_rate": 1.890907997914357e-05, |
|
"loss": 1.2683, |
|
"num_input_tokens_seen": 367770048, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.49140980293077313, |
|
"grad_norm": 0.27595484256744385, |
|
"learning_rate": 1.8837696459325896e-05, |
|
"loss": 1.3023, |
|
"num_input_tokens_seen": 368716352, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.4926730672056594, |
|
"grad_norm": 0.2723195552825928, |
|
"learning_rate": 1.8766290544021696e-05, |
|
"loss": 1.2429, |
|
"num_input_tokens_seen": 369700736, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.4939363314805457, |
|
"grad_norm": 0.2871018052101135, |
|
"learning_rate": 1.869486342869667e-05, |
|
"loss": 1.3019, |
|
"num_input_tokens_seen": 370702016, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.49519959575543204, |
|
"grad_norm": 0.299991250038147, |
|
"learning_rate": 1.8623416309171423e-05, |
|
"loss": 1.2597, |
|
"num_input_tokens_seen": 371647904, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.49646286003031836, |
|
"grad_norm": 0.29281744360923767, |
|
"learning_rate": 1.8551950381601466e-05, |
|
"loss": 1.2109, |
|
"num_input_tokens_seen": 372649376, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.49772612430520463, |
|
"grad_norm": 0.2941571772098541, |
|
"learning_rate": 1.8480466842457208e-05, |
|
"loss": 1.2597, |
|
"num_input_tokens_seen": 373577504, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.49898938858009095, |
|
"grad_norm": 0.25515016913414, |
|
"learning_rate": 1.8408966888503894e-05, |
|
"loss": 1.2588, |
|
"num_input_tokens_seen": 374508256, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.5002526528549772, |
|
"grad_norm": 0.2905372083187103, |
|
"learning_rate": 1.8337451716781592e-05, |
|
"loss": 1.2734, |
|
"num_input_tokens_seen": 375425088, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.5015159171298635, |
|
"grad_norm": 0.27142760157585144, |
|
"learning_rate": 1.8265922524585137e-05, |
|
"loss": 1.2444, |
|
"num_input_tokens_seen": 376367264, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.5027791814047499, |
|
"grad_norm": 0.26266419887542725, |
|
"learning_rate": 1.8194380509444095e-05, |
|
"loss": 1.2504, |
|
"num_input_tokens_seen": 377307360, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.5040424456796362, |
|
"grad_norm": 0.24885958433151245, |
|
"learning_rate": 1.8122826869102706e-05, |
|
"loss": 1.2403, |
|
"num_input_tokens_seen": 378238624, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.5053057099545225, |
|
"grad_norm": 0.2766496241092682, |
|
"learning_rate": 1.8051262801499845e-05, |
|
"loss": 1.2614, |
|
"num_input_tokens_seen": 379241088, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5053057099545225, |
|
"eval_loss": 1.2814823389053345, |
|
"eval_runtime": 12.3847, |
|
"eval_samples_per_second": 12.112, |
|
"eval_steps_per_second": 0.807, |
|
"num_input_tokens_seen": 379241088, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5065689742294088, |
|
"grad_norm": 0.2559678256511688, |
|
"learning_rate": 1.7979689504748963e-05, |
|
"loss": 1.2359, |
|
"num_input_tokens_seen": 380145024, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.5078322385042952, |
|
"grad_norm": 0.276067852973938, |
|
"learning_rate": 1.7908108177118005e-05, |
|
"loss": 1.2247, |
|
"num_input_tokens_seen": 381154496, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.5090955027791814, |
|
"grad_norm": 0.26673588156700134, |
|
"learning_rate": 1.7836520017009383e-05, |
|
"loss": 1.2377, |
|
"num_input_tokens_seen": 382081728, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.5103587670540677, |
|
"grad_norm": 0.2775169014930725, |
|
"learning_rate": 1.7764926222939893e-05, |
|
"loss": 1.2305, |
|
"num_input_tokens_seen": 383040896, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.511622031328954, |
|
"grad_norm": 0.2704101502895355, |
|
"learning_rate": 1.7693327993520654e-05, |
|
"loss": 1.2809, |
|
"num_input_tokens_seen": 383997344, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.5128852956038403, |
|
"grad_norm": 0.2597109079360962, |
|
"learning_rate": 1.7621726527437044e-05, |
|
"loss": 1.2637, |
|
"num_input_tokens_seen": 384951744, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.5141485598787267, |
|
"grad_norm": 0.265578955411911, |
|
"learning_rate": 1.7550123023428622e-05, |
|
"loss": 1.306, |
|
"num_input_tokens_seen": 385818784, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.515411824153613, |
|
"grad_norm": 0.2557640075683594, |
|
"learning_rate": 1.7478518680269075e-05, |
|
"loss": 1.2842, |
|
"num_input_tokens_seen": 386759680, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.5166750884284993, |
|
"grad_norm": 0.25985798239707947, |
|
"learning_rate": 1.740691469674612e-05, |
|
"loss": 1.2464, |
|
"num_input_tokens_seen": 387730016, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.5179383527033855, |
|
"grad_norm": 0.25625666975975037, |
|
"learning_rate": 1.733531227164148e-05, |
|
"loss": 1.2265, |
|
"num_input_tokens_seen": 388693952, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.5192016169782718, |
|
"grad_norm": 0.2758398950099945, |
|
"learning_rate": 1.726371260371076e-05, |
|
"loss": 1.2007, |
|
"num_input_tokens_seen": 389669216, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.5204648812531582, |
|
"grad_norm": 0.27401378750801086, |
|
"learning_rate": 1.7192116891663433e-05, |
|
"loss": 1.2657, |
|
"num_input_tokens_seen": 390647360, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.5217281455280445, |
|
"grad_norm": 0.29113706946372986, |
|
"learning_rate": 1.712052633414272e-05, |
|
"loss": 1.2834, |
|
"num_input_tokens_seen": 391549504, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.5229914098029308, |
|
"grad_norm": 0.2795151472091675, |
|
"learning_rate": 1.7048942129705552e-05, |
|
"loss": 1.2343, |
|
"num_input_tokens_seen": 392518208, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.5242546740778171, |
|
"grad_norm": 0.3003349006175995, |
|
"learning_rate": 1.6977365476802505e-05, |
|
"loss": 1.28, |
|
"num_input_tokens_seen": 393502048, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.5255179383527034, |
|
"grad_norm": 0.28123393654823303, |
|
"learning_rate": 1.690579757375772e-05, |
|
"loss": 1.2696, |
|
"num_input_tokens_seen": 394482816, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.5267812026275897, |
|
"grad_norm": 0.25133296847343445, |
|
"learning_rate": 1.6834239618748856e-05, |
|
"loss": 1.2744, |
|
"num_input_tokens_seen": 395421792, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.528044466902476, |
|
"grad_norm": 0.2568908631801605, |
|
"learning_rate": 1.6762692809787007e-05, |
|
"loss": 1.2162, |
|
"num_input_tokens_seen": 396370464, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.5293077311773623, |
|
"grad_norm": 0.24872644245624542, |
|
"learning_rate": 1.66911583446967e-05, |
|
"loss": 1.2291, |
|
"num_input_tokens_seen": 397275616, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.5305709954522486, |
|
"grad_norm": 0.2645767033100128, |
|
"learning_rate": 1.6619637421095762e-05, |
|
"loss": 1.2803, |
|
"num_input_tokens_seen": 398260032, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5318342597271349, |
|
"grad_norm": 0.2733348608016968, |
|
"learning_rate": 1.654813123637533e-05, |
|
"loss": 1.2447, |
|
"num_input_tokens_seen": 399281952, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.5330975240020213, |
|
"grad_norm": 0.27618396282196045, |
|
"learning_rate": 1.6476640987679787e-05, |
|
"loss": 1.2296, |
|
"num_input_tokens_seen": 400197792, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.5343607882769076, |
|
"grad_norm": 0.2598818242549896, |
|
"learning_rate": 1.64051678718867e-05, |
|
"loss": 1.258, |
|
"num_input_tokens_seen": 401102336, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.5356240525517938, |
|
"grad_norm": 0.254782497882843, |
|
"learning_rate": 1.6333713085586823e-05, |
|
"loss": 1.2465, |
|
"num_input_tokens_seen": 402011040, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.5368873168266801, |
|
"grad_norm": 0.26978209614753723, |
|
"learning_rate": 1.6262277825064032e-05, |
|
"loss": 1.279, |
|
"num_input_tokens_seen": 402950816, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.5381505811015664, |
|
"grad_norm": 0.2889060378074646, |
|
"learning_rate": 1.6190863286275296e-05, |
|
"loss": 1.3152, |
|
"num_input_tokens_seen": 403935136, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.5394138453764528, |
|
"grad_norm": 0.3075631856918335, |
|
"learning_rate": 1.611947066483068e-05, |
|
"loss": 1.2845, |
|
"num_input_tokens_seen": 404952864, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.5406771096513391, |
|
"grad_norm": 0.27360478043556213, |
|
"learning_rate": 1.6048101155973297e-05, |
|
"loss": 1.2516, |
|
"num_input_tokens_seen": 405957920, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.5419403739262254, |
|
"grad_norm": 0.24361246824264526, |
|
"learning_rate": 1.597675595455933e-05, |
|
"loss": 1.2319, |
|
"num_input_tokens_seen": 406898048, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.5432036382011117, |
|
"grad_norm": 0.25894516706466675, |
|
"learning_rate": 1.5905436255038e-05, |
|
"loss": 1.3278, |
|
"num_input_tokens_seen": 407848352, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.5444669024759979, |
|
"grad_norm": 0.2489163875579834, |
|
"learning_rate": 1.583414325143158e-05, |
|
"loss": 1.2478, |
|
"num_input_tokens_seen": 408813152, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.5457301667508843, |
|
"grad_norm": 0.2795446217060089, |
|
"learning_rate": 1.5762878137315406e-05, |
|
"loss": 1.1847, |
|
"num_input_tokens_seen": 409756608, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.5469934310257706, |
|
"grad_norm": 0.2824794352054596, |
|
"learning_rate": 1.5691642105797883e-05, |
|
"loss": 1.2562, |
|
"num_input_tokens_seen": 410623968, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.5482566953006569, |
|
"grad_norm": 0.2690293788909912, |
|
"learning_rate": 1.5620436349500548e-05, |
|
"loss": 1.2486, |
|
"num_input_tokens_seen": 411572768, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.5495199595755432, |
|
"grad_norm": 0.3064996302127838, |
|
"learning_rate": 1.5549262060538054e-05, |
|
"loss": 1.2568, |
|
"num_input_tokens_seen": 412493568, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.5507832238504295, |
|
"grad_norm": 0.2691975235939026, |
|
"learning_rate": 1.547812043049823e-05, |
|
"loss": 1.275, |
|
"num_input_tokens_seen": 413427264, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.5520464881253159, |
|
"grad_norm": 0.27678680419921875, |
|
"learning_rate": 1.5407012650422146e-05, |
|
"loss": 1.2137, |
|
"num_input_tokens_seen": 414404288, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.5533097524002021, |
|
"grad_norm": 0.2862233519554138, |
|
"learning_rate": 1.533593991078415e-05, |
|
"loss": 1.2782, |
|
"num_input_tokens_seen": 415391456, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.5545730166750884, |
|
"grad_norm": 0.2569049298763275, |
|
"learning_rate": 1.5264903401471965e-05, |
|
"loss": 1.2294, |
|
"num_input_tokens_seen": 416316512, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.5558362809499747, |
|
"grad_norm": 0.291337788105011, |
|
"learning_rate": 1.519390431176674e-05, |
|
"loss": 1.1881, |
|
"num_input_tokens_seen": 417250912, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.557099545224861, |
|
"grad_norm": 0.28458911180496216, |
|
"learning_rate": 1.5122943830323157e-05, |
|
"loss": 1.2479, |
|
"num_input_tokens_seen": 418203936, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.5583628094997474, |
|
"grad_norm": 0.2543714642524719, |
|
"learning_rate": 1.505202314514952e-05, |
|
"loss": 1.2394, |
|
"num_input_tokens_seen": 419118304, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.5596260737746337, |
|
"grad_norm": 0.2531825304031372, |
|
"learning_rate": 1.4981143443587867e-05, |
|
"loss": 1.259, |
|
"num_input_tokens_seen": 420057056, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.56088933804952, |
|
"grad_norm": 0.2655525207519531, |
|
"learning_rate": 1.4910305912294114e-05, |
|
"loss": 1.2547, |
|
"num_input_tokens_seen": 421040064, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.5621526023244062, |
|
"grad_norm": 0.2566235363483429, |
|
"learning_rate": 1.4839511737218156e-05, |
|
"loss": 1.2314, |
|
"num_input_tokens_seen": 421967616, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.5634158665992925, |
|
"grad_norm": 0.2777341306209564, |
|
"learning_rate": 1.476876210358402e-05, |
|
"loss": 1.2543, |
|
"num_input_tokens_seen": 422913952, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.5646791308741789, |
|
"grad_norm": 0.26129183173179626, |
|
"learning_rate": 1.4698058195870038e-05, |
|
"loss": 1.247, |
|
"num_input_tokens_seen": 423912288, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.5659423951490652, |
|
"grad_norm": 0.2949627637863159, |
|
"learning_rate": 1.462740119778899e-05, |
|
"loss": 1.2653, |
|
"num_input_tokens_seen": 424904672, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.5672056594239515, |
|
"grad_norm": 0.2683241367340088, |
|
"learning_rate": 1.4556792292268341e-05, |
|
"loss": 1.2303, |
|
"num_input_tokens_seen": 425895936, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.5684689236988378, |
|
"grad_norm": 0.26744595170021057, |
|
"learning_rate": 1.4486232661430359e-05, |
|
"loss": 1.193, |
|
"num_input_tokens_seen": 426778336, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5697321879737242, |
|
"grad_norm": 0.28104472160339355, |
|
"learning_rate": 1.4415723486572379e-05, |
|
"loss": 1.2065, |
|
"num_input_tokens_seen": 427702848, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.5709954522486104, |
|
"grad_norm": 0.2564327120780945, |
|
"learning_rate": 1.434526594814701e-05, |
|
"loss": 1.2315, |
|
"num_input_tokens_seen": 428663616, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.5722587165234967, |
|
"grad_norm": 0.246286079287529, |
|
"learning_rate": 1.4274861225742369e-05, |
|
"loss": 1.2768, |
|
"num_input_tokens_seen": 429622080, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.573521980798383, |
|
"grad_norm": 0.2924240529537201, |
|
"learning_rate": 1.4204510498062347e-05, |
|
"loss": 1.2405, |
|
"num_input_tokens_seen": 430489344, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.5747852450732693, |
|
"grad_norm": 0.26321151852607727, |
|
"learning_rate": 1.4134214942906854e-05, |
|
"loss": 1.2082, |
|
"num_input_tokens_seen": 431465248, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.5760485093481557, |
|
"grad_norm": 0.2737989127635956, |
|
"learning_rate": 1.4063975737152111e-05, |
|
"loss": 1.2378, |
|
"num_input_tokens_seen": 432344320, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.577311773623042, |
|
"grad_norm": 0.23963995277881622, |
|
"learning_rate": 1.3993794056730945e-05, |
|
"loss": 1.2195, |
|
"num_input_tokens_seen": 433296800, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.5785750378979283, |
|
"grad_norm": 0.25392717123031616, |
|
"learning_rate": 1.3923671076613121e-05, |
|
"loss": 1.2768, |
|
"num_input_tokens_seen": 434228672, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.5798383021728145, |
|
"grad_norm": 0.2499849945306778, |
|
"learning_rate": 1.3853607970785636e-05, |
|
"loss": 1.2608, |
|
"num_input_tokens_seen": 435125376, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.5811015664477008, |
|
"grad_norm": 0.2485542893409729, |
|
"learning_rate": 1.3783605912233086e-05, |
|
"loss": 1.3271, |
|
"num_input_tokens_seen": 436060128, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5823648307225872, |
|
"grad_norm": 0.26257503032684326, |
|
"learning_rate": 1.3713666072918025e-05, |
|
"loss": 1.2772, |
|
"num_input_tokens_seen": 437054208, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.5836280949974735, |
|
"grad_norm": 0.27504444122314453, |
|
"learning_rate": 1.3643789623761335e-05, |
|
"loss": 1.2807, |
|
"num_input_tokens_seen": 437972832, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.5848913592723598, |
|
"grad_norm": 0.2476516216993332, |
|
"learning_rate": 1.3573977734622654e-05, |
|
"loss": 1.2403, |
|
"num_input_tokens_seen": 438912832, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.5861546235472461, |
|
"grad_norm": 0.26506373286247253, |
|
"learning_rate": 1.3504231574280742e-05, |
|
"loss": 1.2203, |
|
"num_input_tokens_seen": 439899168, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.5874178878221324, |
|
"grad_norm": 0.29639938473701477, |
|
"learning_rate": 1.3434552310413948e-05, |
|
"loss": 1.314, |
|
"num_input_tokens_seen": 440917152, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.5886811520970187, |
|
"grad_norm": 0.26634323596954346, |
|
"learning_rate": 1.336494110958066e-05, |
|
"loss": 1.2586, |
|
"num_input_tokens_seen": 441860704, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.589944416371905, |
|
"grad_norm": 0.26301464438438416, |
|
"learning_rate": 1.3295399137199744e-05, |
|
"loss": 1.2541, |
|
"num_input_tokens_seen": 442838240, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.5912076806467913, |
|
"grad_norm": 0.26125144958496094, |
|
"learning_rate": 1.3225927557531086e-05, |
|
"loss": 1.2743, |
|
"num_input_tokens_seen": 443835552, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.5924709449216776, |
|
"grad_norm": 0.2652340829372406, |
|
"learning_rate": 1.3156527533656041e-05, |
|
"loss": 1.2308, |
|
"num_input_tokens_seen": 444788896, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.593734209196564, |
|
"grad_norm": 0.2752208411693573, |
|
"learning_rate": 1.3087200227458005e-05, |
|
"loss": 1.2548, |
|
"num_input_tokens_seen": 445779392, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5949974734714503, |
|
"grad_norm": 0.28993070125579834, |
|
"learning_rate": 1.3017946799602943e-05, |
|
"loss": 1.2103, |
|
"num_input_tokens_seen": 446716864, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.5962607377463366, |
|
"grad_norm": 0.248098686337471, |
|
"learning_rate": 1.294876840951995e-05, |
|
"loss": 1.2628, |
|
"num_input_tokens_seen": 447604192, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.5975240020212228, |
|
"grad_norm": 0.26949024200439453, |
|
"learning_rate": 1.2879666215381881e-05, |
|
"loss": 1.219, |
|
"num_input_tokens_seen": 448549600, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.5987872662961091, |
|
"grad_norm": 0.2639176547527313, |
|
"learning_rate": 1.2810641374085904e-05, |
|
"loss": 1.194, |
|
"num_input_tokens_seen": 449481280, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.6000505305709954, |
|
"grad_norm": 0.2593153417110443, |
|
"learning_rate": 1.2741695041234165e-05, |
|
"loss": 1.2001, |
|
"num_input_tokens_seen": 450464096, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.6013137948458818, |
|
"grad_norm": 0.2578306794166565, |
|
"learning_rate": 1.2672828371114441e-05, |
|
"loss": 1.1945, |
|
"num_input_tokens_seen": 451387360, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.6025770591207681, |
|
"grad_norm": 0.2578235864639282, |
|
"learning_rate": 1.2604042516680797e-05, |
|
"loss": 1.2215, |
|
"num_input_tokens_seen": 452345664, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.6038403233956544, |
|
"grad_norm": 0.2732868790626526, |
|
"learning_rate": 1.2535338629534321e-05, |
|
"loss": 1.2748, |
|
"num_input_tokens_seen": 453247008, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.6051035876705407, |
|
"grad_norm": 0.24936838448047638, |
|
"learning_rate": 1.2466717859903794e-05, |
|
"loss": 1.2132, |
|
"num_input_tokens_seen": 454143616, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.6063668519454269, |
|
"grad_norm": 0.2849110960960388, |
|
"learning_rate": 1.2398181356626464e-05, |
|
"loss": 1.2112, |
|
"num_input_tokens_seen": 455058880, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.6076301162203133, |
|
"grad_norm": 0.2991189956665039, |
|
"learning_rate": 1.2329730267128808e-05, |
|
"loss": 1.2349, |
|
"num_input_tokens_seen": 456022464, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.6088933804951996, |
|
"grad_norm": 0.262685626745224, |
|
"learning_rate": 1.2261365737407316e-05, |
|
"loss": 1.2596, |
|
"num_input_tokens_seen": 457002592, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.6101566447700859, |
|
"grad_norm": 0.25802651047706604, |
|
"learning_rate": 1.2193088912009321e-05, |
|
"loss": 1.1975, |
|
"num_input_tokens_seen": 457977152, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.6114199090449722, |
|
"grad_norm": 0.25570937991142273, |
|
"learning_rate": 1.2124900934013812e-05, |
|
"loss": 1.2774, |
|
"num_input_tokens_seen": 458946368, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.6126831733198586, |
|
"grad_norm": 0.2608765959739685, |
|
"learning_rate": 1.2056802945012316e-05, |
|
"loss": 1.2298, |
|
"num_input_tokens_seen": 459789536, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.6139464375947449, |
|
"grad_norm": 0.27471068501472473, |
|
"learning_rate": 1.1988796085089777e-05, |
|
"loss": 1.2663, |
|
"num_input_tokens_seen": 460781856, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.6152097018696311, |
|
"grad_norm": 0.30232349038124084, |
|
"learning_rate": 1.1920881492805467e-05, |
|
"loss": 1.2709, |
|
"num_input_tokens_seen": 461735360, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.6164729661445174, |
|
"grad_norm": 0.2713924050331116, |
|
"learning_rate": 1.1853060305173947e-05, |
|
"loss": 1.2925, |
|
"num_input_tokens_seen": 462762272, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.6177362304194037, |
|
"grad_norm": 0.2612393796443939, |
|
"learning_rate": 1.1785333657645997e-05, |
|
"loss": 1.2671, |
|
"num_input_tokens_seen": 463701440, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.61899949469429, |
|
"grad_norm": 0.2994194030761719, |
|
"learning_rate": 1.1717702684089622e-05, |
|
"loss": 1.2685, |
|
"num_input_tokens_seen": 464628288, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.6202627589691764, |
|
"grad_norm": 0.27403557300567627, |
|
"learning_rate": 1.1650168516771077e-05, |
|
"loss": 1.2313, |
|
"num_input_tokens_seen": 465563264, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.6215260232440627, |
|
"grad_norm": 0.2665519118309021, |
|
"learning_rate": 1.1582732286335892e-05, |
|
"loss": 1.2608, |
|
"num_input_tokens_seen": 466527296, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.622789287518949, |
|
"grad_norm": 0.2931445837020874, |
|
"learning_rate": 1.151539512178998e-05, |
|
"loss": 1.1978, |
|
"num_input_tokens_seen": 467422144, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.6240525517938352, |
|
"grad_norm": 0.243869349360466, |
|
"learning_rate": 1.1448158150480684e-05, |
|
"loss": 1.2584, |
|
"num_input_tokens_seen": 468346080, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.6253158160687216, |
|
"grad_norm": 0.24073927104473114, |
|
"learning_rate": 1.1381022498077936e-05, |
|
"loss": 1.2786, |
|
"num_input_tokens_seen": 469268160, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.6265790803436079, |
|
"grad_norm": 0.2580939531326294, |
|
"learning_rate": 1.1313989288555403e-05, |
|
"loss": 1.3028, |
|
"num_input_tokens_seen": 470217248, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.6278423446184942, |
|
"grad_norm": 0.27437812089920044, |
|
"learning_rate": 1.1247059644171683e-05, |
|
"loss": 1.1893, |
|
"num_input_tokens_seen": 471134528, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.6291056088933805, |
|
"grad_norm": 0.27005961537361145, |
|
"learning_rate": 1.1180234685451485e-05, |
|
"loss": 1.2873, |
|
"num_input_tokens_seen": 472091616, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.6303688731682668, |
|
"grad_norm": 0.2728407680988312, |
|
"learning_rate": 1.1113515531166905e-05, |
|
"loss": 1.2812, |
|
"num_input_tokens_seen": 473036928, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.631632137443153, |
|
"grad_norm": 0.2591012716293335, |
|
"learning_rate": 1.1046903298318667e-05, |
|
"loss": 1.2289, |
|
"num_input_tokens_seen": 474006976, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6328954017180394, |
|
"grad_norm": 0.23528583347797394, |
|
"learning_rate": 1.0980399102117435e-05, |
|
"loss": 1.2315, |
|
"num_input_tokens_seen": 474996096, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.6341586659929257, |
|
"grad_norm": 0.27465859055519104, |
|
"learning_rate": 1.0914004055965161e-05, |
|
"loss": 1.3264, |
|
"num_input_tokens_seen": 475933248, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.635421930267812, |
|
"grad_norm": 0.27259302139282227, |
|
"learning_rate": 1.08477192714364e-05, |
|
"loss": 1.2479, |
|
"num_input_tokens_seen": 476921888, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.6366851945426983, |
|
"grad_norm": 0.2752089202404022, |
|
"learning_rate": 1.078154585825974e-05, |
|
"loss": 1.1889, |
|
"num_input_tokens_seen": 477911648, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.6379484588175847, |
|
"grad_norm": 0.2641167938709259, |
|
"learning_rate": 1.0715484924299207e-05, |
|
"loss": 1.1821, |
|
"num_input_tokens_seen": 478897216, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.639211723092471, |
|
"grad_norm": 0.24626615643501282, |
|
"learning_rate": 1.0649537575535706e-05, |
|
"loss": 1.3228, |
|
"num_input_tokens_seen": 479897216, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.6404749873673572, |
|
"grad_norm": 0.25866448879241943, |
|
"learning_rate": 1.0583704916048546e-05, |
|
"loss": 1.2286, |
|
"num_input_tokens_seen": 480879104, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.6417382516422435, |
|
"grad_norm": 0.2469986230134964, |
|
"learning_rate": 1.05179880479969e-05, |
|
"loss": 1.2382, |
|
"num_input_tokens_seen": 481884800, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.6430015159171298, |
|
"grad_norm": 0.26307523250579834, |
|
"learning_rate": 1.0452388071601396e-05, |
|
"loss": 1.2541, |
|
"num_input_tokens_seen": 482806624, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.6442647801920162, |
|
"grad_norm": 0.2624097168445587, |
|
"learning_rate": 1.0386906085125676e-05, |
|
"loss": 1.2405, |
|
"num_input_tokens_seen": 483727232, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.6455280444669025, |
|
"grad_norm": 0.25804755091667175, |
|
"learning_rate": 1.0321543184858012e-05, |
|
"loss": 1.2258, |
|
"num_input_tokens_seen": 484757024, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.6467913087417888, |
|
"grad_norm": 0.26082345843315125, |
|
"learning_rate": 1.0256300465092968e-05, |
|
"loss": 1.2453, |
|
"num_input_tokens_seen": 485694944, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.6480545730166751, |
|
"grad_norm": 0.26765161752700806, |
|
"learning_rate": 1.0191179018113052e-05, |
|
"loss": 1.2447, |
|
"num_input_tokens_seen": 486613664, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.6493178372915613, |
|
"grad_norm": 0.2676701545715332, |
|
"learning_rate": 1.0126179934170446e-05, |
|
"loss": 1.3095, |
|
"num_input_tokens_seen": 487574816, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.6505811015664477, |
|
"grad_norm": 0.2636936604976654, |
|
"learning_rate": 1.0061304301468766e-05, |
|
"loss": 1.2053, |
|
"num_input_tokens_seen": 488516544, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.651844365841334, |
|
"grad_norm": 0.2662390172481537, |
|
"learning_rate": 9.996553206144797e-06, |
|
"loss": 1.2751, |
|
"num_input_tokens_seen": 489412608, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.6531076301162203, |
|
"grad_norm": 0.26386016607284546, |
|
"learning_rate": 9.931927732250374e-06, |
|
"loss": 1.2631, |
|
"num_input_tokens_seen": 490374624, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.6543708943911066, |
|
"grad_norm": 0.27195560932159424, |
|
"learning_rate": 9.867428961734188e-06, |
|
"loss": 1.2587, |
|
"num_input_tokens_seen": 491366592, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.655634158665993, |
|
"grad_norm": 0.2867816686630249, |
|
"learning_rate": 9.803057974423667e-06, |
|
"loss": 1.2609, |
|
"num_input_tokens_seen": 492314912, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.6568974229408793, |
|
"grad_norm": 0.28000280261039734, |
|
"learning_rate": 9.738815848006945e-06, |
|
"loss": 1.2562, |
|
"num_input_tokens_seen": 493215136, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.6581606872157655, |
|
"grad_norm": 0.27017146348953247, |
|
"learning_rate": 9.674703658014749e-06, |
|
"loss": 1.2261, |
|
"num_input_tokens_seen": 494146080, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.6594239514906518, |
|
"grad_norm": 0.2675604522228241, |
|
"learning_rate": 9.610722477802483e-06, |
|
"loss": 1.292, |
|
"num_input_tokens_seen": 495103840, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.6606872157655381, |
|
"grad_norm": 0.2377164214849472, |
|
"learning_rate": 9.546873378532158e-06, |
|
"loss": 1.2278, |
|
"num_input_tokens_seen": 496014752, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.6619504800404244, |
|
"grad_norm": 0.2551622688770294, |
|
"learning_rate": 9.483157429154547e-06, |
|
"loss": 1.247, |
|
"num_input_tokens_seen": 496955936, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.6632137443153108, |
|
"grad_norm": 0.2615555226802826, |
|
"learning_rate": 9.419575696391218e-06, |
|
"loss": 1.2705, |
|
"num_input_tokens_seen": 497881920, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.6644770085901971, |
|
"grad_norm": 0.2722395956516266, |
|
"learning_rate": 9.356129244716729e-06, |
|
"loss": 1.2736, |
|
"num_input_tokens_seen": 498859040, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.6657402728650834, |
|
"grad_norm": 0.2843475639820099, |
|
"learning_rate": 9.29281913634078e-06, |
|
"loss": 1.2112, |
|
"num_input_tokens_seen": 499848032, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.6670035371399696, |
|
"grad_norm": 0.260781466960907, |
|
"learning_rate": 9.22964643119044e-06, |
|
"loss": 1.2301, |
|
"num_input_tokens_seen": 500782656, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.668266801414856, |
|
"grad_norm": 0.28937065601348877, |
|
"learning_rate": 9.166612186892376e-06, |
|
"loss": 1.2573, |
|
"num_input_tokens_seen": 501775328, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.6695300656897423, |
|
"grad_norm": 0.24364541471004486, |
|
"learning_rate": 9.103717458755188e-06, |
|
"loss": 1.2888, |
|
"num_input_tokens_seen": 502721632, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.6707933299646286, |
|
"grad_norm": 0.32249847054481506, |
|
"learning_rate": 9.040963299751722e-06, |
|
"loss": 1.2103, |
|
"num_input_tokens_seen": 503649088, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.6720565942395149, |
|
"grad_norm": 0.274586945772171, |
|
"learning_rate": 8.978350760501413e-06, |
|
"loss": 1.2604, |
|
"num_input_tokens_seen": 504589696, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.6733198585144012, |
|
"grad_norm": 0.25306662917137146, |
|
"learning_rate": 8.915880889252758e-06, |
|
"loss": 1.212, |
|
"num_input_tokens_seen": 505495648, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.6745831227892876, |
|
"grad_norm": 0.2675648629665375, |
|
"learning_rate": 8.853554731865696e-06, |
|
"loss": 1.2735, |
|
"num_input_tokens_seen": 506399776, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.6758463870641738, |
|
"grad_norm": 0.25868740677833557, |
|
"learning_rate": 8.791373331794155e-06, |
|
"loss": 1.2346, |
|
"num_input_tokens_seen": 507369920, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.6771096513390601, |
|
"grad_norm": 0.26915502548217773, |
|
"learning_rate": 8.729337730068559e-06, |
|
"loss": 1.2514, |
|
"num_input_tokens_seen": 508312480, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.6783729156139464, |
|
"grad_norm": 0.27946212887763977, |
|
"learning_rate": 8.667448965278404e-06, |
|
"loss": 1.2084, |
|
"num_input_tokens_seen": 509257024, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.6796361798888327, |
|
"grad_norm": 0.2765122950077057, |
|
"learning_rate": 8.60570807355484e-06, |
|
"loss": 1.2396, |
|
"num_input_tokens_seen": 510240480, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.6808994441637191, |
|
"grad_norm": 0.24776999652385712, |
|
"learning_rate": 8.54411608855339e-06, |
|
"loss": 1.1789, |
|
"num_input_tokens_seen": 511188832, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.6821627084386054, |
|
"grad_norm": 0.2991964519023895, |
|
"learning_rate": 8.482674041436567e-06, |
|
"loss": 1.2665, |
|
"num_input_tokens_seen": 512158368, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6834259727134917, |
|
"grad_norm": 0.28031983971595764, |
|
"learning_rate": 8.421382960856695e-06, |
|
"loss": 1.2297, |
|
"num_input_tokens_seen": 513132704, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.6846892369883779, |
|
"grad_norm": 0.2627319395542145, |
|
"learning_rate": 8.360243872938599e-06, |
|
"loss": 1.2734, |
|
"num_input_tokens_seen": 514124160, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.6859525012632642, |
|
"grad_norm": 0.2459687888622284, |
|
"learning_rate": 8.299257801262496e-06, |
|
"loss": 1.2091, |
|
"num_input_tokens_seen": 515011840, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.6872157655381506, |
|
"grad_norm": 0.26756593585014343, |
|
"learning_rate": 8.238425766846812e-06, |
|
"loss": 1.2104, |
|
"num_input_tokens_seen": 515957856, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.6884790298130369, |
|
"grad_norm": 0.293277382850647, |
|
"learning_rate": 8.177748788131119e-06, |
|
"loss": 1.2523, |
|
"num_input_tokens_seen": 516907040, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.6897422940879232, |
|
"grad_norm": 0.2430182844400406, |
|
"learning_rate": 8.117227880959081e-06, |
|
"loss": 1.2209, |
|
"num_input_tokens_seen": 517874624, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.6910055583628095, |
|
"grad_norm": 0.26824715733528137, |
|
"learning_rate": 8.056864058561416e-06, |
|
"loss": 1.2237, |
|
"num_input_tokens_seen": 518780064, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.6922688226376958, |
|
"grad_norm": 0.2571701407432556, |
|
"learning_rate": 7.996658331538978e-06, |
|
"loss": 1.2251, |
|
"num_input_tokens_seen": 519746560, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.6935320869125821, |
|
"grad_norm": 0.25399723649024963, |
|
"learning_rate": 7.936611707845793e-06, |
|
"loss": 1.2448, |
|
"num_input_tokens_seen": 520710432, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.6947953511874684, |
|
"grad_norm": 0.24103257060050964, |
|
"learning_rate": 7.876725192772224e-06, |
|
"loss": 1.1599, |
|
"num_input_tokens_seen": 521672128, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6960586154623547, |
|
"grad_norm": 0.2598767876625061, |
|
"learning_rate": 7.816999788928119e-06, |
|
"loss": 1.2595, |
|
"num_input_tokens_seen": 522644576, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.697321879737241, |
|
"grad_norm": 0.28568968176841736, |
|
"learning_rate": 7.757436496226034e-06, |
|
"loss": 1.2672, |
|
"num_input_tokens_seen": 523695168, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.6985851440121273, |
|
"grad_norm": 0.264839768409729, |
|
"learning_rate": 7.698036311864467e-06, |
|
"loss": 1.2521, |
|
"num_input_tokens_seen": 524620992, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.6998484082870137, |
|
"grad_norm": 0.27619093656539917, |
|
"learning_rate": 7.638800230311206e-06, |
|
"loss": 1.1977, |
|
"num_input_tokens_seen": 525573280, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.7011116725619, |
|
"grad_norm": 0.2585349380970001, |
|
"learning_rate": 7.579729243286638e-06, |
|
"loss": 1.2956, |
|
"num_input_tokens_seen": 526491552, |
|
"step": 5550 |
|
}, |
|
{
"epoch": 0.7023749368367862,
"grad_norm": 0.26802536845207214,
"learning_rate": 7.5208243397471995e-06,
"loss": 1.2719,
"num_input_tokens_seen": 527423648,
"step": 5560
},
{
"epoch": 0.7036382011116725,
"grad_norm": 0.2632644474506378,
"learning_rate": 7.462086505868744e-06,
"loss": 1.208,
"num_input_tokens_seen": 528368960,
"step": 5570
},
{
"epoch": 0.7049014653865588,
"grad_norm": 0.25977852940559387,
"learning_rate": 7.4035167250301035e-06,
"loss": 1.1928,
"num_input_tokens_seen": 529333984,
"step": 5580
},
{
"epoch": 0.7061647296614452,
"grad_norm": 0.2557479739189148,
"learning_rate": 7.345115977796573e-06,
"loss": 1.1766,
"num_input_tokens_seen": 530305760,
"step": 5590
},
{
"epoch": 0.7074279939363315,
"grad_norm": 0.2768225073814392,
"learning_rate": 7.286885241903531e-06,
"loss": 1.2209,
"num_input_tokens_seen": 531239232,
"step": 5600
},
{
"epoch": 0.7086912582112178,
"grad_norm": 0.27175867557525635,
"learning_rate": 7.2288254922400575e-06,
"loss": 1.2839,
"num_input_tokens_seen": 532124640,
"step": 5610
},
{
"epoch": 0.7099545224861041,
"grad_norm": 0.28098565340042114,
"learning_rate": 7.1709377008325895e-06,
"loss": 1.2523,
"num_input_tokens_seen": 533148320,
"step": 5620
},
{
"epoch": 0.7112177867609903,
"grad_norm": 0.2613276541233063,
"learning_rate": 7.113222836828695e-06,
"loss": 1.1796,
"num_input_tokens_seen": 534125856,
"step": 5630
},
{
"epoch": 0.7124810510358767,
"grad_norm": 0.24941375851631165,
"learning_rate": 7.055681866480792e-06,
"loss": 1.2102,
"num_input_tokens_seen": 535057408,
"step": 5640
},
{
"epoch": 0.713744315310763,
"grad_norm": 0.28444018959999084,
"learning_rate": 6.998315753130024e-06,
"loss": 1.1713,
"num_input_tokens_seen": 536041280,
"step": 5650
},
{
"epoch": 0.7150075795856493,
"grad_norm": 0.2781004309654236,
"learning_rate": 6.9411254571901e-06,
"loss": 1.2121,
"num_input_tokens_seen": 536970048,
"step": 5660
},
{
"epoch": 0.7162708438605356,
"grad_norm": 0.2684124708175659,
"learning_rate": 6.884111936131231e-06,
"loss": 1.2733,
"num_input_tokens_seen": 537863008,
"step": 5670
},
{
"epoch": 0.717534108135422,
"grad_norm": 0.27960875630378723,
"learning_rate": 6.82727614446407e-06,
"loss": 1.1975,
"num_input_tokens_seen": 538773152,
"step": 5680
},
{
"epoch": 0.7187973724103083,
"grad_norm": 0.24374781548976898,
"learning_rate": 6.770619033723783e-06,
"loss": 1.2273,
"num_input_tokens_seen": 539793088,
"step": 5690
},
{
"epoch": 0.7200606366851945,
"grad_norm": 0.2838081121444702,
"learning_rate": 6.714141552454072e-06,
"loss": 1.2066,
"num_input_tokens_seen": 540656768,
"step": 5700
},
{
"epoch": 0.7213239009600808,
"grad_norm": 0.24478621780872345,
"learning_rate": 6.657844646191328e-06,
"loss": 1.2102,
"num_input_tokens_seen": 541561248,
"step": 5710
},
{
"epoch": 0.7225871652349671,
"grad_norm": 0.2654918432235718,
"learning_rate": 6.6017292574487635e-06,
"loss": 1.2756,
"num_input_tokens_seen": 542457408,
"step": 5720
},
{
"epoch": 0.7238504295098535,
"grad_norm": 0.24361199140548706,
"learning_rate": 6.545796325700683e-06,
"loss": 1.1843,
"num_input_tokens_seen": 543394112,
"step": 5730
},
{
"epoch": 0.7251136937847398,
"grad_norm": 0.27256685495376587,
"learning_rate": 6.4900467873667e-06,
"loss": 1.2305,
"num_input_tokens_seen": 544360768,
"step": 5740
},
{
"epoch": 0.7263769580596261,
"grad_norm": 0.24635472893714905,
"learning_rate": 6.434481575796107e-06,
"loss": 1.243,
"num_input_tokens_seen": 545282080,
"step": 5750
},
{
"epoch": 0.7276402223345124,
"grad_norm": 0.306068480014801,
"learning_rate": 6.3791016212522256e-06,
"loss": 1.2045,
"num_input_tokens_seen": 546234848,
"step": 5760
},
{
"epoch": 0.7289034866093986,
"grad_norm": 0.26721495389938354,
"learning_rate": 6.32390785089682e-06,
"loss": 1.2897,
"num_input_tokens_seen": 547182400,
"step": 5770
},
{
"epoch": 0.730166750884285,
"grad_norm": 0.25117790699005127,
"learning_rate": 6.268901188774617e-06,
"loss": 1.2824,
"num_input_tokens_seen": 548096000,
"step": 5780
},
{
"epoch": 0.7314300151591713,
"grad_norm": 0.2862393260002136,
"learning_rate": 6.2140825557977745e-06,
"loss": 1.2498,
"num_input_tokens_seen": 549029216,
"step": 5790
},
{
"epoch": 0.7326932794340576,
"grad_norm": 0.25375497341156006,
"learning_rate": 6.159452869730546e-06,
"loss": 1.2498,
"num_input_tokens_seen": 550029152,
"step": 5800
},
{
"epoch": 0.7339565437089439,
"grad_norm": 0.2733435034751892,
"learning_rate": 6.1050130451738186e-06,
"loss": 1.1756,
"num_input_tokens_seen": 551018848,
"step": 5810
},
{
"epoch": 0.7352198079838302,
"grad_norm": 0.25357958674430847,
"learning_rate": 6.050763993549884e-06,
"loss": 1.1967,
"num_input_tokens_seen": 551936608,
"step": 5820
},
{
"epoch": 0.7364830722587166,
"grad_norm": 0.2535962760448456,
"learning_rate": 5.996706623087126e-06,
"loss": 1.251,
"num_input_tokens_seen": 552928192,
"step": 5830
},
{
"epoch": 0.7377463365336028,
"grad_norm": 0.26090991497039795,
"learning_rate": 5.942841838804848e-06,
"loss": 1.2385,
"num_input_tokens_seen": 553912960,
"step": 5840
},
{
"epoch": 0.7390096008084891,
"grad_norm": 0.2640230357646942,
"learning_rate": 5.889170542498102e-06,
"loss": 1.2426,
"num_input_tokens_seen": 554837248,
"step": 5850
},
{
"epoch": 0.7402728650833754,
"grad_norm": 0.24669994413852692,
"learning_rate": 5.835693632722607e-06,
"loss": 1.1978,
"num_input_tokens_seen": 555733696,
"step": 5860
},
{
"epoch": 0.7415361293582617,
"grad_norm": 0.2583445608615875,
"learning_rate": 5.7824120047796725e-06,
"loss": 1.2602,
"num_input_tokens_seen": 556739392,
"step": 5870
},
{
"epoch": 0.7427993936331481,
"grad_norm": 0.24428869783878326,
"learning_rate": 5.729326550701263e-06,
"loss": 1.2476,
"num_input_tokens_seen": 557767840,
"step": 5880
},
{
"epoch": 0.7440626579080344,
"grad_norm": 0.26555436849594116,
"learning_rate": 5.676438159235005e-06,
"loss": 1.265,
"num_input_tokens_seen": 558685312,
"step": 5890
},
{
"epoch": 0.7453259221829207,
"grad_norm": 0.29612812399864197,
"learning_rate": 5.623747715829356e-06,
"loss": 1.2436,
"num_input_tokens_seen": 559607904,
"step": 5900
},
{
"epoch": 0.7465891864578069,
"grad_norm": 0.26325854659080505,
"learning_rate": 5.571256102618758e-06,
"loss": 1.2447,
"num_input_tokens_seen": 560536256,
"step": 5910
},
{
"epoch": 0.7478524507326932,
"grad_norm": 0.2596051096916199,
"learning_rate": 5.518964198408862e-06,
"loss": 1.2401,
"num_input_tokens_seen": 561426784,
"step": 5920
},
{
"epoch": 0.7491157150075796,
"grad_norm": 0.28517597913742065,
"learning_rate": 5.466872878661839e-06,
"loss": 1.2213,
"num_input_tokens_seen": 562311360,
"step": 5930
},
{
"epoch": 0.7503789792824659,
"grad_norm": 0.24300004541873932,
"learning_rate": 5.414983015481682e-06,
"loss": 1.2828,
"num_input_tokens_seen": 563216640,
"step": 5940
},
{
"epoch": 0.7516422435573522,
"grad_norm": 0.26081758737564087,
"learning_rate": 5.363295477599677e-06,
"loss": 1.2356,
"num_input_tokens_seen": 564140992,
"step": 5950
},
{
"epoch": 0.7529055078322385,
"grad_norm": 0.30684077739715576,
"learning_rate": 5.311811130359772e-06,
"loss": 1.2487,
"num_input_tokens_seen": 565051296,
"step": 5960
},
{
"epoch": 0.7541687721071249,
"grad_norm": 0.243248850107193,
"learning_rate": 5.260530835704159e-06,
"loss": 1.2313,
"num_input_tokens_seen": 566038848,
"step": 5970
},
{
"epoch": 0.7554320363820111,
"grad_norm": 0.2502289116382599,
"learning_rate": 5.209455452158796e-06,
"loss": 1.2092,
"num_input_tokens_seen": 567044608,
"step": 5980
},
{
"epoch": 0.7566953006568974,
"grad_norm": 0.26396942138671875,
"learning_rate": 5.1585858348190666e-06,
"loss": 1.2309,
"num_input_tokens_seen": 567994848,
"step": 5990
},
{
"epoch": 0.7579585649317837,
"grad_norm": 0.2504906952381134,
"learning_rate": 5.107922835335452e-06,
"loss": 1.2367,
"num_input_tokens_seen": 568955808,
"step": 6000
},
{
"epoch": 0.7579585649317837,
"eval_loss": 1.2595031261444092,
"eval_runtime": 13.0677,
"eval_samples_per_second": 11.479,
"eval_steps_per_second": 0.765,
"num_input_tokens_seen": 568955808,
"step": 6000
},
{
"epoch": 0.75922182920667,
"grad_norm": 0.2684820592403412,
"learning_rate": 5.057467301899274e-06,
"loss": 1.1746,
"num_input_tokens_seen": 569895776,
"step": 6010
},
{
"epoch": 0.7604850934815564,
"grad_norm": 0.2721717655658722,
"learning_rate": 5.007220079228478e-06,
"loss": 1.2066,
"num_input_tokens_seen": 570859552,
"step": 6020
},
{
"epoch": 0.7617483577564427,
"grad_norm": 0.25938835740089417,
"learning_rate": 4.957182008553527e-06,
"loss": 1.2192,
"num_input_tokens_seen": 571787136,
"step": 6030
},
{
"epoch": 0.763011622031329,
"grad_norm": 0.25528407096862793,
"learning_rate": 4.9073539276032756e-06,
"loss": 1.2433,
"num_input_tokens_seen": 572685056,
"step": 6040
},
{
"epoch": 0.7642748863062152,
"grad_norm": 0.22747959196567535,
"learning_rate": 4.857736670590982e-06,
"loss": 1.2425,
"num_input_tokens_seen": 573630944,
"step": 6050
},
{
"epoch": 0.7655381505811015,
"grad_norm": 0.23501618206501007,
"learning_rate": 4.808331068200329e-06,
"loss": 1.3179,
"num_input_tokens_seen": 574504000,
"step": 6060
},
{
"epoch": 0.7668014148559879,
"grad_norm": 0.2590336203575134,
"learning_rate": 4.759137947571491e-06,
"loss": 1.2479,
"num_input_tokens_seen": 575465184,
"step": 6070
},
{
"epoch": 0.7680646791308742,
"grad_norm": 0.2563855051994324,
"learning_rate": 4.710158132287332e-06,
"loss": 1.2028,
"num_input_tokens_seen": 576397088,
"step": 6080
},
{
"epoch": 0.7693279434057605,
"grad_norm": 0.29565200209617615,
"learning_rate": 4.661392442359582e-06,
"loss": 1.2799,
"num_input_tokens_seen": 577387744,
"step": 6090
},
{
"epoch": 0.7705912076806468,
"grad_norm": 0.26293325424194336,
"learning_rate": 4.612841694215136e-06,
"loss": 1.2272,
"num_input_tokens_seen": 578310496,
"step": 6100
},
{
"epoch": 0.7718544719555331,
"grad_norm": 0.2616961598396301,
"learning_rate": 4.56450670068234e-06,
"loss": 1.2489,
"num_input_tokens_seen": 579258496,
"step": 6110
},
{
"epoch": 0.7731177362304194,
"grad_norm": 0.24685987830162048,
"learning_rate": 4.51638827097745e-06,
"loss": 1.2588,
"num_input_tokens_seen": 580197760,
"step": 6120
},
{
"epoch": 0.7743810005053057,
"grad_norm": 0.2490658164024353,
"learning_rate": 4.46848721069101e-06,
"loss": 1.293,
"num_input_tokens_seen": 581108448,
"step": 6130
},
{
"epoch": 0.775644264780192,
"grad_norm": 0.24475279450416565,
"learning_rate": 4.420804321774441e-06,
"loss": 1.287,
"num_input_tokens_seen": 582039072,
"step": 6140
},
{
"epoch": 0.7769075290550783,
"grad_norm": 0.2623221278190613,
"learning_rate": 4.373340402526543e-06,
"loss": 1.2117,
"num_input_tokens_seen": 582932992,
"step": 6150
},
{
"epoch": 0.7781707933299646,
"grad_norm": 0.27465909719467163,
"learning_rate": 4.326096247580186e-06,
"loss": 1.2135,
"num_input_tokens_seen": 583861568,
"step": 6160
},
{
"epoch": 0.779434057604851,
"grad_norm": 0.28181222081184387,
"learning_rate": 4.27907264788896e-06,
"loss": 1.2537,
"num_input_tokens_seen": 584843136,
"step": 6170
},
{
"epoch": 0.7806973218797373,
"grad_norm": 0.2493135631084442,
"learning_rate": 4.23227039071398e-06,
"loss": 1.2263,
"num_input_tokens_seen": 585837664,
"step": 6180
},
{
"epoch": 0.7819605861546235,
"grad_norm": 0.26791173219680786,
"learning_rate": 4.1856902596106726e-06,
"loss": 1.2273,
"num_input_tokens_seen": 586797536,
"step": 6190
},
{
"epoch": 0.7832238504295098,
"grad_norm": 0.26550182700157166,
"learning_rate": 4.139333034415663e-06,
"loss": 1.2031,
"num_input_tokens_seen": 587734880,
"step": 6200
},
{
"epoch": 0.7844871147043961,
"grad_norm": 0.27607518434524536,
"learning_rate": 4.0931994912337345e-06,
"loss": 1.2426,
"num_input_tokens_seen": 588659360,
"step": 6210
},
{
"epoch": 0.7857503789792825,
"grad_norm": 0.2891901433467865,
"learning_rate": 4.047290402424806e-06,
"loss": 1.2864,
"num_input_tokens_seen": 589628256,
"step": 6220
},
{
"epoch": 0.7870136432541688,
"grad_norm": 0.2835799753665924,
"learning_rate": 4.001606536591042e-06,
"loss": 1.2634,
"num_input_tokens_seen": 590567904,
"step": 6230
},
{
"epoch": 0.7882769075290551,
"grad_norm": 0.2466340959072113,
"learning_rate": 3.956148658563945e-06,
"loss": 1.1893,
"num_input_tokens_seen": 591514912,
"step": 6240
},
{
"epoch": 0.7895401718039414,
"grad_norm": 0.2408566027879715,
"learning_rate": 3.910917529391582e-06,
"loss": 1.1672,
"num_input_tokens_seen": 592500416,
"step": 6250
},
{
"epoch": 0.7908034360788276,
"grad_norm": 0.28810036182403564,
"learning_rate": 3.8659139063258146e-06,
"loss": 1.2376,
"num_input_tokens_seen": 593538144,
"step": 6260
},
{
"epoch": 0.792066700353714,
"grad_norm": 0.26853030920028687,
"learning_rate": 3.8211385428096474e-06,
"loss": 1.2726,
"num_input_tokens_seen": 594506272,
"step": 6270
},
{
"epoch": 0.7933299646286003,
"grad_norm": 0.2816145122051239,
"learning_rate": 3.7765921884645917e-06,
"loss": 1.3003,
"num_input_tokens_seen": 595431904,
"step": 6280
},
{
"epoch": 0.7945932289034866,
"grad_norm": 0.26149782538414,
"learning_rate": 3.7322755890781368e-06,
"loss": 1.2477,
"num_input_tokens_seen": 596461440,
"step": 6290
},
{
"epoch": 0.7958564931783729,
"grad_norm": 0.260708749294281,
"learning_rate": 3.68818948659125e-06,
"loss": 1.256,
"num_input_tokens_seen": 597473312,
"step": 6300
},
{
"epoch": 0.7971197574532592,
"grad_norm": 0.27105608582496643,
"learning_rate": 3.6443346190859598e-06,
"loss": 1.2488,
"num_input_tokens_seen": 598412000,
"step": 6310
},
{
"epoch": 0.7983830217281456,
"grad_norm": 0.24419113993644714,
"learning_rate": 3.600711720772991e-06,
"loss": 1.2774,
"num_input_tokens_seen": 599430656,
"step": 6320
},
{
"epoch": 0.7996462860030318,
"grad_norm": 0.25261548161506653,
"learning_rate": 3.557321521979489e-06,
"loss": 1.2279,
"num_input_tokens_seen": 600412224,
"step": 6330
},
{
"epoch": 0.8009095502779181,
"grad_norm": 0.25508007407188416,
"learning_rate": 3.51416474913678e-06,
"loss": 1.251,
"num_input_tokens_seen": 601375968,
"step": 6340
},
{
"epoch": 0.8021728145528044,
"grad_norm": 0.2806225121021271,
"learning_rate": 3.471242124768207e-06,
"loss": 1.2055,
"num_input_tokens_seen": 602286496,
"step": 6350
},
{
"epoch": 0.8034360788276907,
"grad_norm": 0.32982784509658813,
"learning_rate": 3.42855436747705e-06,
"loss": 1.2309,
"num_input_tokens_seen": 603281216,
"step": 6360
},
{
"epoch": 0.8046993431025771,
"grad_norm": 0.27231696248054504,
"learning_rate": 3.3861021919344735e-06,
"loss": 1.1807,
"num_input_tokens_seen": 604231360,
"step": 6370
},
{
"epoch": 0.8059626073774634,
"grad_norm": 0.2853865325450897,
"learning_rate": 3.3438863088675783e-06,
"loss": 1.2638,
"num_input_tokens_seen": 605138944,
"step": 6380
},
{
"epoch": 0.8072258716523497,
"grad_norm": 0.2520991563796997,
"learning_rate": 3.301907425047496e-06,
"loss": 1.2291,
"num_input_tokens_seen": 606092896,
"step": 6390
},
{
"epoch": 0.8084891359272359,
"grad_norm": 0.2628813683986664,
"learning_rate": 3.260166243277564e-06,
"loss": 1.2588,
"num_input_tokens_seen": 607004512,
"step": 6400
},
{
"epoch": 0.8097524002021222,
"grad_norm": 0.24886657297611237,
"learning_rate": 3.2186634623815337e-06,
"loss": 1.2636,
"num_input_tokens_seen": 607919360,
"step": 6410
},
{
"epoch": 0.8110156644770086,
"grad_norm": 0.2556428909301758,
"learning_rate": 3.177399777191912e-06,
"loss": 1.2427,
"num_input_tokens_seen": 608921984,
"step": 6420
},
{
"epoch": 0.8122789287518949,
"grad_norm": 0.24436554312705994,
"learning_rate": 3.1363758785382866e-06,
"loss": 1.2667,
"num_input_tokens_seen": 609854816,
"step": 6430
},
{
"epoch": 0.8135421930267812,
"grad_norm": 0.26374685764312744,
"learning_rate": 3.0955924532357908e-06,
"loss": 1.2398,
"num_input_tokens_seen": 610815712,
"step": 6440
},
{
"epoch": 0.8148054573016675,
"grad_norm": 0.28322839736938477,
"learning_rate": 3.055050184073599e-06,
"loss": 1.2552,
"num_input_tokens_seen": 611770144,
"step": 6450
},
{
"epoch": 0.8160687215765539,
"grad_norm": 0.2539218068122864,
"learning_rate": 3.0147497498034735e-06,
"loss": 1.202,
"num_input_tokens_seen": 612729024,
"step": 6460
},
{
"epoch": 0.8173319858514401,
"grad_norm": 0.27928316593170166,
"learning_rate": 2.974691825128433e-06,
"loss": 1.2777,
"num_input_tokens_seen": 613643488,
"step": 6470
},
{
"epoch": 0.8185952501263264,
"grad_norm": 0.26042285561561584,
"learning_rate": 2.934877080691438e-06,
"loss": 1.2077,
"num_input_tokens_seen": 614610560,
"step": 6480
},
{
"epoch": 0.8198585144012127,
"grad_norm": 0.24354539811611176,
"learning_rate": 2.8953061830641663e-06,
"loss": 1.191,
"num_input_tokens_seen": 615577216,
"step": 6490
},
{
"epoch": 0.821121778676099,
"grad_norm": 0.2690410912036896,
"learning_rate": 2.8559797947358463e-06,
"loss": 1.1872,
"num_input_tokens_seen": 616548384,
"step": 6500
},
{
"epoch": 0.8223850429509854,
"grad_norm": 0.2414551079273224,
"learning_rate": 2.8168985741021875e-06,
"loss": 1.2318,
"num_input_tokens_seen": 617543904,
"step": 6510
},
{
"epoch": 0.8236483072258717,
"grad_norm": 0.23589564859867096,
"learning_rate": 2.7780631754543265e-06,
"loss": 1.2087,
"num_input_tokens_seen": 618540128,
"step": 6520
},
{
"epoch": 0.824911571500758,
"grad_norm": 0.25712019205093384,
"learning_rate": 2.739474248967916e-06,
"loss": 1.1912,
"num_input_tokens_seen": 619500352,
"step": 6530
},
{
"epoch": 0.8261748357756442,
"grad_norm": 0.26267293095588684,
"learning_rate": 2.7011324406921816e-06,
"loss": 1.2882,
"num_input_tokens_seen": 620453920,
"step": 6540
},
{
"epoch": 0.8274381000505305,
"grad_norm": 0.2525344789028168,
"learning_rate": 2.6630383925391654e-06,
"loss": 1.2602,
"num_input_tokens_seen": 621427552,
"step": 6550
},
{
"epoch": 0.8287013643254169,
"grad_norm": 0.25016433000564575,
"learning_rate": 2.6251927422729305e-06,
"loss": 1.2071,
"num_input_tokens_seen": 622454432,
"step": 6560
},
{
"epoch": 0.8299646286003032,
"grad_norm": 0.24579358100891113,
"learning_rate": 2.5875961234989185e-06,
"loss": 1.2262,
"num_input_tokens_seen": 623389792,
"step": 6570
},
{
"epoch": 0.8312278928751895,
"grad_norm": 0.24960210919380188,
"learning_rate": 2.5502491656533293e-06,
"loss": 1.1894,
"num_input_tokens_seen": 624352928,
"step": 6580
},
{
"epoch": 0.8324911571500758,
"grad_norm": 0.2529809772968292,
"learning_rate": 2.513152493992568e-06,
"loss": 1.2355,
"num_input_tokens_seen": 625237472,
"step": 6590
},
{
"epoch": 0.8337544214249621,
"grad_norm": 0.2756924331188202,
"learning_rate": 2.4763067295828053e-06,
"loss": 1.1959,
"num_input_tokens_seen": 626200416,
"step": 6600
},
{
"epoch": 0.8350176856998484,
"grad_norm": 0.2560481131076813,
"learning_rate": 2.439712489289555e-06,
"loss": 1.1686,
"num_input_tokens_seen": 627085760,
"step": 6610
},
{
"epoch": 0.8362809499747347,
"grad_norm": 0.2564622461795807,
"learning_rate": 2.403370385767364e-06,
"loss": 1.2475,
"num_input_tokens_seen": 628078240,
"step": 6620
},
{
"epoch": 0.837544214249621,
"grad_norm": 0.2827485203742981,
"learning_rate": 2.367281027449548e-06,
"loss": 1.1958,
"num_input_tokens_seen": 629016384,
"step": 6630
},
{
"epoch": 0.8388074785245073,
"grad_norm": 0.2654615342617035,
"learning_rate": 2.3314450185380047e-06,
"loss": 1.278,
"num_input_tokens_seen": 629963040,
"step": 6640
},
{
"epoch": 0.8400707427993936,
"grad_norm": 0.26686492562294006,
"learning_rate": 2.295862958993091e-06,
"loss": 1.2544,
"num_input_tokens_seen": 630921504,
"step": 6650
},
{
"epoch": 0.84133400707428,
"grad_norm": 0.2568102180957794,
"learning_rate": 2.2605354445236036e-06,
"loss": 1.1788,
"num_input_tokens_seen": 631837184,
"step": 6660
},
{
"epoch": 0.8425972713491663,
"grad_norm": 0.2527879476547241,
"learning_rate": 2.2254630665767636e-06,
"loss": 1.2889,
"num_input_tokens_seen": 632828288,
"step": 6670
},
{
"epoch": 0.8438605356240525,
"grad_norm": 0.26815953850746155,
"learning_rate": 2.1906464123283744e-06,
"loss": 1.2576,
"num_input_tokens_seen": 633815520,
"step": 6680
},
{
"epoch": 0.8451237998989388,
"grad_norm": 0.2878230810165405,
"learning_rate": 2.156086064672924e-06,
"loss": 1.2808,
"num_input_tokens_seen": 634722208,
"step": 6690
},
{
"epoch": 0.8463870641738251,
"grad_norm": 0.2378537356853485,
"learning_rate": 2.1217826022138783e-06,
"loss": 1.1683,
"num_input_tokens_seen": 635706144,
"step": 6700
},
{
"epoch": 0.8476503284487115,
"grad_norm": 0.25701719522476196,
"learning_rate": 2.0877365992539653e-06,
"loss": 1.2215,
"num_input_tokens_seen": 636619104,
"step": 6710
},
{
"epoch": 0.8489135927235978,
"grad_norm": 0.24454209208488464,
"learning_rate": 2.0539486257855774e-06,
"loss": 1.262,
"num_input_tokens_seen": 637517568,
"step": 6720
},
{
"epoch": 0.8501768569984841,
"grad_norm": 0.2640119791030884,
"learning_rate": 2.0204192474812166e-06,
"loss": 1.2826,
"num_input_tokens_seen": 638479936,
"step": 6730
},
{
"epoch": 0.8514401212733704,
"grad_norm": 0.2534317076206207,
"learning_rate": 1.987149025684028e-06,
"loss": 1.2236,
"num_input_tokens_seen": 639357088,
"step": 6740
},
{
"epoch": 0.8527033855482566,
"grad_norm": 0.2551516890525818,
"learning_rate": 1.9541385173984074e-06,
"loss": 1.1855,
"num_input_tokens_seen": 640362912,
"step": 6750
},
{
"epoch": 0.853966649823143,
"grad_norm": 0.257917582988739,
"learning_rate": 1.921388275280664e-06,
"loss": 1.2111,
"num_input_tokens_seen": 641336448,
"step": 6760
},
{
"epoch": 0.8552299140980293,
"grad_norm": 0.2687523663043976,
"learning_rate": 1.888898847629779e-06,
"loss": 1.2092,
"num_input_tokens_seen": 642348704,
"step": 6770
},
{
"epoch": 0.8564931783729156,
"grad_norm": 0.27500104904174805,
"learning_rate": 1.8566707783782231e-06,
"loss": 1.2022,
"num_input_tokens_seen": 643290272,
"step": 6780
},
{
"epoch": 0.8577564426478019,
"grad_norm": 0.27554988861083984,
"learning_rate": 1.8247046070828535e-06,
"loss": 1.1901,
"num_input_tokens_seen": 644221792,
"step": 6790
},
{
"epoch": 0.8590197069226883,
"grad_norm": 0.2787459194660187,
"learning_rate": 1.7930008689158637e-06,
"loss": 1.2127,
"num_input_tokens_seen": 645176224,
"step": 6800
},
{
"epoch": 0.8602829711975746,
"grad_norm": 0.23403003811836243,
"learning_rate": 1.761560094655851e-06,
"loss": 1.2688,
"num_input_tokens_seen": 646193152,
"step": 6810
},
{
"epoch": 0.8615462354724608,
"grad_norm": 0.2776746451854706,
"learning_rate": 1.730382810678895e-06,
"loss": 1.2174,
"num_input_tokens_seen": 647194528,
"step": 6820
},
{
"epoch": 0.8628094997473471,
"grad_norm": 0.2932538092136383,
"learning_rate": 1.6994695389497982e-06,
"loss": 1.1361,
"num_input_tokens_seen": 648208224,
"step": 6830
},
{
"epoch": 0.8640727640222334,
"grad_norm": 0.26842474937438965,
"learning_rate": 1.6688207970132808e-06,
"loss": 1.2041,
"num_input_tokens_seen": 649171072,
"step": 6840
},
{
"epoch": 0.8653360282971198,
"grad_norm": 0.2833315134048462,
"learning_rate": 1.6384370979853776e-06,
"loss": 1.27,
"num_input_tokens_seen": 650172224,
"step": 6850
},
{
"epoch": 0.8665992925720061,
"grad_norm": 0.26029422879219055,
"learning_rate": 1.6083189505447964e-06,
"loss": 1.2732,
"num_input_tokens_seen": 651096864,
"step": 6860
},
{
"epoch": 0.8678625568468924,
"grad_norm": 0.2853679060935974,
"learning_rate": 1.578466858924442e-06,
"loss": 1.1936,
"num_input_tokens_seen": 652020192,
"step": 6870
},
{
"epoch": 0.8691258211217787,
"grad_norm": 0.28354784846305847,
"learning_rate": 1.548881322902959e-06,
"loss": 1.2461,
"num_input_tokens_seen": 652919488,
"step": 6880
},
{
"epoch": 0.8703890853966649,
"grad_norm": 0.2513621747493744,
"learning_rate": 1.5195628377963493e-06,
"loss": 1.2352,
"num_input_tokens_seen": 653868192,
"step": 6890
},
{
"epoch": 0.8716523496715513,
"grad_norm": 0.2537190616130829,
"learning_rate": 1.4905118944497058e-06,
"loss": 1.1954,
"num_input_tokens_seen": 654866304,
"step": 6900
},
{
"epoch": 0.8729156139464376,
"grad_norm": 0.26647478342056274,
"learning_rate": 1.4617289792289743e-06,
"loss": 1.2386,
"num_input_tokens_seen": 655850752,
"step": 6910
},
{
"epoch": 0.8741788782213239,
"grad_norm": 0.2586477994918823,
"learning_rate": 1.4332145740128345e-06,
"loss": 1.256,
"num_input_tokens_seen": 656778176,
"step": 6920
},
{
"epoch": 0.8754421424962102,
"grad_norm": 0.2705184817314148,
"learning_rate": 1.4049691561845975e-06,
"loss": 1.2329,
"num_input_tokens_seen": 657784128,
"step": 6930
},
{
"epoch": 0.8767054067710965,
"grad_norm": 0.2453477680683136,
"learning_rate": 1.376993198624248e-06,
"loss": 1.1833,
"num_input_tokens_seen": 658703168,
"step": 6940
},
{
"epoch": 0.8779686710459829,
"grad_norm": 0.25567731261253357,
"learning_rate": 1.3492871697005042e-06,
"loss": 1.2284,
"num_input_tokens_seen": 659688864,
"step": 6950
},
{
"epoch": 0.8792319353208691,
"grad_norm": 0.29871034622192383,
"learning_rate": 1.3218515332629892e-06,
"loss": 1.2664,
"num_input_tokens_seen": 660603104,
"step": 6960
},
{
"epoch": 0.8804951995957554,
"grad_norm": 0.25376957654953003,
"learning_rate": 1.2946867486344597e-06,
"loss": 1.2197,
"num_input_tokens_seen": 661552704,
"step": 6970
},
{
"epoch": 0.8817584638706417,
"grad_norm": 0.3075960874557495,
"learning_rate": 1.267793270603122e-06,
"loss": 1.1982,
"num_input_tokens_seen": 662524096,
"step": 6980
},
{
"epoch": 0.883021728145528,
"grad_norm": 0.2471645623445511,
"learning_rate": 1.2411715494150024e-06,
"loss": 1.1913,
"num_input_tokens_seen": 663442336,
"step": 6990
},
{
"epoch": 0.8842849924204144,
"grad_norm": 0.2692629098892212,
"learning_rate": 1.214822030766437e-06,
"loss": 1.2643,
"num_input_tokens_seen": 664365344,
"step": 7000
},
{
"epoch": 0.8855482566953007,
"grad_norm": 0.2840708792209625,
"learning_rate": 1.1887451557965732e-06,
"loss": 1.1826,
"num_input_tokens_seen": 665290880,
"step": 7010
},
{
"epoch": 0.886811520970187,
"grad_norm": 0.2730172574520111,
"learning_rate": 1.1629413610800198e-06,
"loss": 1.2738,
"num_input_tokens_seen": 666231392,
"step": 7020
},
{
"epoch": 0.8880747852450732,
"grad_norm": 0.28216251730918884,
"learning_rate": 1.1374110786195212e-06,
"loss": 1.1925,
"num_input_tokens_seen": 667211072,
"step": 7030
},
{
"epoch": 0.8893380495199595,
"grad_norm": 0.25766119360923767,
"learning_rate": 1.1121547358387154e-06,
"loss": 1.2013,
"num_input_tokens_seen": 668144320,
"step": 7040
},
{
"epoch": 0.8906013137948459,
"grad_norm": 0.24992607533931732,
"learning_rate": 1.087172755575001e-06,
"loss": 1.1939,
"num_input_tokens_seen": 669092064,
"step": 7050
},
{
"epoch": 0.8918645780697322,
"grad_norm": 0.26488760113716125,
"learning_rate": 1.0624655560724363e-06,
"loss": 1.2276,
"num_input_tokens_seen": 670011840,
"step": 7060
},
{
"epoch": 0.8931278423446185,
"grad_norm": 0.25586891174316406,
"learning_rate": 1.0380335509747583e-06,
"loss": 1.2528,
"num_input_tokens_seen": 670906560,
"step": 7070
},
{
"epoch": 0.8943911066195048,
"grad_norm": 0.2638219892978668,
"learning_rate": 1.0138771493184352e-06,
"loss": 1.2721,
"num_input_tokens_seen": 671885760,
"step": 7080
},
{
"epoch": 0.8956543708943911,
"grad_norm": 0.25774410367012024,
"learning_rate": 9.899967555258347e-07,
"loss": 1.2788,
"num_input_tokens_seen": 672838336,
"step": 7090
},
{
"epoch": 0.8969176351692774,
"grad_norm": 0.24537810683250427,
"learning_rate": 9.663927693984438e-07,
"loss": 1.2218,
"num_input_tokens_seen": 673773728,
"step": 7100
},
{
"epoch": 0.8981808994441637,
"grad_norm": 0.269209623336792,
"learning_rate": 9.430655861101829e-07,
"loss": 1.1914,
"num_input_tokens_seen": 674686496,
"step": 7110
},
{
"epoch": 0.89944416371905,
"grad_norm": 0.2713133692741394,
"learning_rate": 9.200155962007868e-07,
"loss": 1.221,
"num_input_tokens_seen": 675659040,
"step": 7120
},
{
"epoch": 0.9007074279939363,
"grad_norm": 0.2782800793647766,
"learning_rate": 8.972431855692685e-07,
"loss": 1.2197,
"num_input_tokens_seen": 676523936,
"step": 7130
},
{
"epoch": 0.9019706922688226,
"grad_norm": 0.28656941652297974,
"learning_rate": 8.747487354674457e-07,
"loss": 1.2924,
"num_input_tokens_seen": 677481408,
"step": 7140
},
{
"epoch": 0.903233956543709,
"grad_norm": 0.2603612542152405,
"learning_rate": 8.525326224935794e-07,
"loss": 1.2418,
"num_input_tokens_seen": 678461056,
"step": 7150
},
{
"epoch": 0.9044972208185953,
"grad_norm": 0.2789015471935272,
"learning_rate": 8.305952185860484e-07,
"loss": 1.1934,
"num_input_tokens_seen": 679452256,
"step": 7160
},
{
"epoch": 0.9057604850934815,
"grad_norm": 0.29948341846466064,
"learning_rate": 8.089368910171396e-07,
"loss": 1.2467,
"num_input_tokens_seen": 680371648,
"step": 7170
},
{
"epoch": 0.9070237493683678,
"grad_norm": 0.26572108268737793,
"learning_rate": 7.875580023868885e-07,
"loss": 1.1925,
"num_input_tokens_seen": 681355648,
"step": 7180
},
{
"epoch": 0.9082870136432541,
"grad_norm": 0.24899084866046906,
"learning_rate": 7.664589106170069e-07,
"loss": 1.252,
"num_input_tokens_seen": 682361344,
"step": 7190
},
{
"epoch": 0.9095502779181405,
"grad_norm": 0.24572855234146118,
"learning_rate": 7.456399689449052e-07,
"loss": 1.2339,
"num_input_tokens_seen": 683316896,
"step": 7200
},
{
"epoch": 0.9108135421930268,
"grad_norm": 0.2785273492336273,
"learning_rate": 7.251015259177561e-07,
"loss": 1.2259,
"num_input_tokens_seen": 684286528,
"step": 7210
},
{
"epoch": 0.9120768064679131,
"grad_norm": 0.24116089940071106,
"learning_rate": 7.048439253866866e-07,
"loss": 1.1971,
"num_input_tokens_seen": 685241440,
"step": 7220
},
{
"epoch": 0.9133400707427994,
"grad_norm": 0.25249651074409485,
"learning_rate": 6.848675065009904e-07,
"loss": 1.1883,
"num_input_tokens_seen": 686179008,
"step": 7230
},
{
"epoch": 0.9146033350176856,
"grad_norm": 0.24898767471313477,
"learning_rate": 6.651726037024796e-07,
"loss": 1.2214,
"num_input_tokens_seen": 687148992,
"step": 7240
},
{
"epoch": 0.915866599292572,
"grad_norm": 0.2656947672367096,
"learning_rate": 6.457595467198567e-07,
"loss": 1.1936,
"num_input_tokens_seen": 688136000,
"step": 7250
},
{
"epoch": 0.9171298635674583,
"grad_norm": 0.2621888816356659,
"learning_rate": 6.266286605632295e-07,
"loss": 1.2068,
"num_input_tokens_seen": 689067328,
"step": 7260
},
{
"epoch": 0.9183931278423446,
"grad_norm": 0.2367779016494751,
"learning_rate": 6.07780265518632e-07,
"loss": 1.2581,
"num_input_tokens_seen": 690001664,
"step": 7270
},
{
"epoch": 0.9196563921172309,
"grad_norm": 0.24973830580711365,
"learning_rate": 5.892146771426915e-07,
"loss": 1.2381,
"num_input_tokens_seen": 690943648,
"step": 7280
},
{
"epoch": 0.9209196563921173,
"grad_norm": 0.2687539756298065,
"learning_rate": 5.70932206257326e-07,
"loss": 1.2386,
"num_input_tokens_seen": 691864224,
"step": 7290
},
{
"epoch": 0.9221829206670036,
"grad_norm": 0.25320330262184143,
"learning_rate": 5.529331589445516e-07,
"loss": 1.2678,
"num_input_tokens_seen": 692833472,
"step": 7300
},
{
"epoch": 0.9234461849418898,
"grad_norm": 0.2584136426448822,
"learning_rate": 5.35217836541362e-07,
"loss": 1.2621,
"num_input_tokens_seen": 693706112,
"step": 7310
},
{
"epoch": 0.9247094492167761,
"grad_norm": 0.2527817487716675,
"learning_rate": 5.177865356346644e-07,
"loss": 1.2521,
"num_input_tokens_seen": 694636736,
"step": 7320
},
{
"epoch": 0.9259727134916624,
"grad_norm": 0.24299506843090057,
"learning_rate": 5.00639548056338e-07,
"loss": 1.2517,
"num_input_tokens_seen": 695631264,
"step": 7330
},
{
"epoch": 0.9272359777665488,
"grad_norm": 0.24970118701457977,
"learning_rate": 4.837771608783264e-07,
"loss": 1.2364,
"num_input_tokens_seen": 696587872,
"step": 7340
},
{
"epoch": 0.9284992420414351,
"grad_norm": 0.2587854564189911,
"learning_rate": 4.6719965640784676e-07,
"loss": 1.2376,
"num_input_tokens_seen": 697601376,
"step": 7350
},
{
"epoch": 0.9297625063163214,
"grad_norm": 0.26746806502342224,
"learning_rate": 4.509073121826623e-07,
"loss": 1.2466,
"num_input_tokens_seen": 698550432,
"step": 7360
},
{
"epoch": 0.9310257705912077,
"grad_norm": 0.269715815782547,
"learning_rate": 4.349004009664275e-07,
"loss": 1.2421,
"num_input_tokens_seen": 699511744,
"step": 7370
},
{
"epoch": 0.9322890348660939,
"grad_norm": 0.24946600198745728,
"learning_rate": 4.1917919074412416e-07,
"loss": 1.1982,
"num_input_tokens_seen": 700446176,
"step": 7380
},
{
"epoch": 0.9335522991409803,
"grad_norm": 0.281342089176178,
"learning_rate": 4.037439447175789e-07,
"loss": 1.2408,
"num_input_tokens_seen": 701373568,
"step": 7390
},
{
"epoch": 0.9348155634158666,
"grad_norm": 0.2512856125831604,
"learning_rate": 3.88594921301055e-07,
"loss": 1.2414,
"num_input_tokens_seen": 702294016,
"step": 7400
},
{
"epoch": 0.9360788276907529,
"grad_norm": 0.2601119577884674,
"learning_rate": 3.737323741169257e-07,
"loss": 1.2491,
"num_input_tokens_seen": 703232672,
"step": 7410
},
{
"epoch": 0.9373420919656392,
"grad_norm": 0.270298033952713,
"learning_rate": 3.5915655199142663e-07,
"loss": 1.2174,
"num_input_tokens_seen": 704175744,
"step": 7420
},
{
"epoch": 0.9386053562405255,
"grad_norm": 0.23530983924865723,
"learning_rate": 3.448676989504925e-07,
"loss": 1.2368,
"num_input_tokens_seen": 705141664,
"step": 7430
},
{
"epoch": 0.9398686205154119,
"grad_norm": 0.2633696496486664,
"learning_rate": 3.308660542156694e-07,
"loss": 1.2018,
"num_input_tokens_seen": 706067200,
"step": 7440
},
{
"epoch": 0.9411318847902981,
"grad_norm": 0.26215797662734985,
"learning_rate": 3.1715185220010984e-07,
"loss": 1.2193,
"num_input_tokens_seen": 706966304,
"step": 7450
},
{
"epoch": 0.9423951490651844,
"grad_norm": 0.27117466926574707,
"learning_rate": 3.037253225046529e-07,
"loss": 1.2907,
"num_input_tokens_seen": 707921440,
"step": 7460
},
{
"epoch": 0.9436584133400707,
"grad_norm": 0.27227288484573364,
"learning_rate": 2.905866899139708e-07,
"loss": 1.251,
"num_input_tokens_seen": 708838784,
"step": 7470
},
{
"epoch": 0.944921677614957,
"grad_norm": 0.26309284567832947,
"learning_rate": 2.777361743928194e-07,
"loss": 1.2574,
"num_input_tokens_seen": 709754176,
"step": 7480
},
{
"epoch": 0.9461849418898434,
"grad_norm": 0.24601784348487854,
"learning_rate": 2.6517399108233886e-07,
"loss": 1.1808,
"num_input_tokens_seen": 710722944,
"step": 7490
},
{
"epoch": 0.9474482061647297,
"grad_norm": 0.28660014271736145,
"learning_rate": 2.5290035029646523e-07,
"loss": 1.2572,
"num_input_tokens_seen": 711716256,
"step": 7500
},
{
"epoch": 0.948711470439616,
"grad_norm": 0.2446954995393753,
"learning_rate": 2.409154575184077e-07,
"loss": 1.1996,
"num_input_tokens_seen": 712625856,
"step": 7510
},
{
"epoch": 0.9499747347145022,
"grad_norm": 0.2447938770055771,
"learning_rate": 2.2921951339720053e-07,
"loss": 1.2414,
"num_input_tokens_seen": 713581728,
"step": 7520
},
{
"epoch": 0.9512379989893885,
"grad_norm": 0.2409149706363678,
"learning_rate": 2.178127137443489e-07,
"loss": 1.1916,
"num_input_tokens_seen": 714471360,
"step": 7530
},
{
"epoch": 0.9525012632642749,
"grad_norm": 0.25430941581726074,
"learning_rate": 2.0669524953055377e-07,
"loss": 1.2343,
"num_input_tokens_seen": 715391488,
"step": 7540
},
{
"epoch": 0.9537645275391612,
"grad_norm": 0.27573850750923157,
"learning_rate": 1.9586730688250395e-07,
"loss": 1.2559,
"num_input_tokens_seen": 716352896,
"step": 7550
},
{
"epoch": 0.9550277918140475,
"grad_norm": 0.2683832347393036,
"learning_rate": 1.8532906707978106e-07,
"loss": 1.2169,
"num_input_tokens_seen": 717298784,
"step": 7560
},
{
"epoch": 0.9562910560889338,
"grad_norm": 0.28321197628974915,
"learning_rate": 1.7508070655179757e-07,
"loss": 1.2796,
"num_input_tokens_seen": 718316000,
"step": 7570
},
{
"epoch": 0.9575543203638202,
"grad_norm": 0.25757691264152527,
"learning_rate": 1.65122396874863e-07,
"loss": 1.2222,
"num_input_tokens_seen": 719217248,
"step": 7580
},
{
"epoch": 0.9588175846387064,
"grad_norm": 0.2687084972858429,
"learning_rate": 1.5545430476930465e-07,
"loss": 1.1853,
"num_input_tokens_seen": 720198464,
"step": 7590
},
{
"epoch": 0.9600808489135927,
"grad_norm": 0.2586497664451599,
"learning_rate": 1.4607659209667165e-07,
"loss": 1.2438,
"num_input_tokens_seen": 721068160,
"step": 7600
},
{
"epoch": 0.961344113188479,
"grad_norm": 0.24861587584018707,
"learning_rate": 1.3698941585704033e-07,
"loss": 1.2712,
"num_input_tokens_seen": 722061472,
"step": 7610
},
{
"epoch": 0.9626073774633653,
"grad_norm": 0.244459331035614,
"learning_rate": 1.281929281863639e-07,
"loss": 1.1897,
"num_input_tokens_seen": 723015232,
"step": 7620
},
{
"epoch": 0.9638706417382517,
"grad_norm": 0.225861594080925,
"learning_rate": 1.1968727635394497e-07,
"loss": 1.2689,
"num_input_tokens_seen": 724000384,
"step": 7630
},
{
"epoch": 0.965133906013138,
"grad_norm": 0.246552512049675,
"learning_rate": 1.1147260275995634e-07,
"loss": 1.1784,
"num_input_tokens_seen": 724964992,
"step": 7640
},
{
"epoch": 0.9663971702880243,
"grad_norm": 0.2584232687950134,
"learning_rate": 1.0354904493306865e-07,
"loss": 1.2263,
"num_input_tokens_seen": 725923104,
"step": 7650
},
{
"epoch": 0.9676604345629105,
"grad_norm": 0.25840452313423157,
"learning_rate": 9.591673552813844e-08,
"loss": 1.2081,
"num_input_tokens_seen": 726876224,
"step": 7660
},
{
"epoch": 0.9689236988377968,
"grad_norm": 0.28871768712997437,
"learning_rate": 8.85758023239913e-08,
"loss": 1.2545,
"num_input_tokens_seen": 727721568,
"step": 7670
},
{
"epoch": 0.9701869631126832,
"grad_norm": 0.29037731885910034,
"learning_rate": 8.152636822127883e-08,
"loss": 1.2221,
"num_input_tokens_seen": 728634912,
"step": 7680
},
{
"epoch": 0.9714502273875695,
"grad_norm": 0.2691645324230194,
"learning_rate": 7.476855124043086e-08,
"loss": 1.2158,
"num_input_tokens_seen": 729574464,
"step": 7690
},
{
"epoch": 0.9727134916624558,
"grad_norm": 0.2742849290370941,
"learning_rate": 6.830246451966975e-08,
"loss": 1.2089,
"num_input_tokens_seen": 730499136,
"step": 7700
},
{
"epoch": 0.9739767559373421,
"grad_norm": 0.26165613532066345,
"learning_rate": 6.212821631311621e-08,
"loss": 1.2314,
"num_input_tokens_seen": 731461280,
"step": 7710
},
{
"epoch": 0.9752400202122284,
"grad_norm": 0.24117015302181244,
"learning_rate": 5.624590998898615e-08,
"loss": 1.2055,
"num_input_tokens_seen": 732374848,
"step": 7720
},
{
"epoch": 0.9765032844871147,
"grad_norm": 0.2643440365791321,
"learning_rate": 5.0655644027847994e-08,
"loss": 1.2044,
"num_input_tokens_seen": 733271648,
"step": 7730
},
{
"epoch": 0.977766548762001,
"grad_norm": 0.24681268632411957,
"learning_rate": 4.5357512020986755e-08,
"loss": 1.1749,
"num_input_tokens_seen": 734233312,
"step": 7740
},
{
"epoch": 0.9790298130368873,
"grad_norm": 0.28687500953674316,
"learning_rate": 4.0351602668824423e-08,
"loss": 1.2237,
"num_input_tokens_seen": 735189120,
"step": 7750
},
{
"epoch": 0.9802930773117736,
"grad_norm": 0.2667155861854553,
"learning_rate": 3.563799977944537e-08,
"loss": 1.2138,
"num_input_tokens_seen": 736120128,
"step": 7760
},
{
"epoch": 0.9815563415866599,
"grad_norm": 0.25432640314102173,
"learning_rate": 3.121678226718577e-08,
"loss": 1.1976,
"num_input_tokens_seen": 737063456,
"step": 7770
},
{
"epoch": 0.9828196058615463,
"grad_norm": 0.2468518167734146,
"learning_rate": 2.708802415131828e-08,
"loss": 1.2268,
"num_input_tokens_seen": 738004096,
"step": 7780
},
{
"epoch": 0.9840828701364326,
"grad_norm": 0.27853333950042725,
"learning_rate": 2.3251794554806636e-08,
"loss": 1.2074,
"num_input_tokens_seen": 739017440,
"step": 7790
},
{
"epoch": 0.9853461344113188,
"grad_norm": 0.26621630787849426,
"learning_rate": 1.9708157703157424e-08,
"loss": 1.213,
"num_input_tokens_seen": 740034656,
"step": 7800
},
{
"epoch": 0.9866093986862051,
"grad_norm": 0.2626071572303772,
"learning_rate": 1.645717292333204e-08,
"loss": 1.2604,
"num_input_tokens_seen": 741063104,
"step": 7810
},
{
"epoch": 0.9878726629610914,
"grad_norm": 0.26386693120002747,
"learning_rate": 1.3498894642769432e-08,
"loss": 1.2779,
"num_input_tokens_seen": 742014688,
"step": 7820
},
{
"epoch": 0.9891359272359778,
"grad_norm": 0.2615217864513397,
"learning_rate": 1.0833372388455442e-08,
"loss": 1.2108,
"num_input_tokens_seen": 742960160,
"step": 7830
},
{
"epoch": 0.9903991915108641,
"grad_norm": 0.2661604881286621,
"learning_rate": 8.460650786114576e-09,
"loss": 1.1899,
"num_input_tokens_seen": 743845760,
"step": 7840
},
{
"epoch": 0.9916624557857504,
"grad_norm": 0.26591452956199646,
"learning_rate": 6.380769559444499e-09,
"loss": 1.2474,
"num_input_tokens_seen": 744760672,
"step": 7850
},
{
"epoch": 0.9929257200606367,
"grad_norm": 0.27036914229393005,
"learning_rate": 4.5937635294671094e-09,
"loss": 1.2709,
"num_input_tokens_seen": 745728352,
"step": 7860
},
{
"epoch": 0.9941889843355229,
"grad_norm": 0.24849487841129303,
"learning_rate": 3.099662613930132e-09,
"loss": 1.2096,
"num_input_tokens_seen": 746640928,
"step": 7870
},
{
"epoch": 0.9954522486104093,
"grad_norm": 0.2538692057132721,
"learning_rate": 1.8984918268175055e-09,
"loss": 1.2464,
"num_input_tokens_seen": 747588896,
"step": 7880
},
{
"epoch": 0.9967155128852956,
"grad_norm": 0.26595503091812134,
"learning_rate": 9.902712779277788e-10,
"loss": 1.2883,
"num_input_tokens_seen": 748464864,
"step": 7890
},
{
"epoch": 0.9979787771601819,
"grad_norm": 0.27239322662353516,
"learning_rate": 3.7501617253216096e-10,
"loss": 1.1961,
"num_input_tokens_seen": 749490752,
"step": 7900
},
{
"epoch": 0.9992420414350682,
"grad_norm": 0.2784164249897003,
"learning_rate": 5.2736811129716613e-11,
"loss": 1.2785,
"num_input_tokens_seen": 750395392,
"step": 7910
},
{
"epoch": 1.0,
"num_input_tokens_seen": 750938410,
"step": 7916,
"total_flos": 3.6248418467253043e+18,
"train_loss": 1.2696220230851407,
"train_runtime": 79988.0702,
"train_samples_per_second": 12.667,
"train_steps_per_second": 0.099
}
],
"logging_steps": 10,
"max_steps": 7916,
"num_input_tokens_seen": 750938410,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6248418467253043e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}