{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2328598516350088,
  "eval_steps": 500,
  "global_step": 21000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 3.2580924034118652, "learning_rate": 2.2172949002217296e-06, "loss": 10.2933, "step": 50 },
    { "epoch": 0.0, "grad_norm": 2.4347386360168457, "learning_rate": 4.434589800443459e-06, "loss": 10.1894, "step": 100 },
    { "epoch": 0.0, "grad_norm": 2.3895885944366455, "learning_rate": 6.651884700665188e-06, "loss": 10.1424, "step": 150 },
    { "epoch": 0.0, "grad_norm": 2.129647731781006, "learning_rate": 8.869179600886918e-06, "loss": 10.0995, "step": 200 },
    { "epoch": 0.0, "grad_norm": 2.3564186096191406, "learning_rate": 1.1086474501108649e-05, "loss": 10.0479, "step": 250 },
    { "epoch": 0.0, "grad_norm": 1.830551028251648, "learning_rate": 1.3303769401330377e-05, "loss": 9.9971, "step": 300 },
    { "epoch": 0.0, "grad_norm": 2.1173911094665527, "learning_rate": 1.5521064301552106e-05, "loss": 9.9201, "step": 350 },
    { "epoch": 0.0, "grad_norm": 1.6636557579040527, "learning_rate": 1.7738359201773837e-05, "loss": 9.8562, "step": 400 },
    { "epoch": 0.0, "grad_norm": 2.4503839015960693, "learning_rate": 1.9955654101995567e-05, "loss": 9.7599, "step": 450 },
    { "epoch": 0.01, "grad_norm": 1.822424054145813, "learning_rate": 2.2172949002217298e-05, "loss": 9.6608, "step": 500 },
    { "epoch": 0.01, "grad_norm": 1.6598998308181763, "learning_rate": 2.4390243902439026e-05, "loss": 9.55, "step": 550 },
    { "epoch": 0.01, "grad_norm": 1.8471707105636597, "learning_rate": 2.6607538802660753e-05, "loss": 9.4606, "step": 600 },
    { "epoch": 0.01, "grad_norm": 1.4833533763885498, "learning_rate": 2.8824833702882487e-05, "loss": 9.3283, "step": 650 },
    { "epoch": 0.01, "grad_norm": 1.688541054725647, "learning_rate": 3.104212860310421e-05, "loss": 9.2229, "step": 700 },
    { "epoch": 0.01, "grad_norm": 1.6466543674468994, "learning_rate": 3.325942350332594e-05, "loss": 9.1093, "step": 750 },
    { "epoch": 0.01, "grad_norm": 1.4169293642044067, "learning_rate": 3.547671840354767e-05, "loss": 8.9703, "step": 800 },
    { "epoch": 0.01, "grad_norm": 1.7079193592071533, "learning_rate": 3.7694013303769404e-05, "loss": 8.8351, "step": 850 },
    { "epoch": 0.01, "grad_norm": 1.5513204336166382, "learning_rate": 3.9911308203991135e-05, "loss": 8.7111, "step": 900 },
    { "epoch": 0.01, "grad_norm": 1.485573172569275, "learning_rate": 4.212860310421286e-05, "loss": 8.5627, "step": 950 },
    { "epoch": 0.01, "grad_norm": 1.511690616607666, "learning_rate": 4.4345898004434597e-05, "loss": 8.5042, "step": 1000 },
    { "epoch": 0.01, "grad_norm": 2.1478614807128906, "learning_rate": 4.656319290465632e-05, "loss": 8.3287, "step": 1050 },
    { "epoch": 0.01, "grad_norm": 1.4060652256011963, "learning_rate": 4.878048780487805e-05, "loss": 8.2341, "step": 1100 },
    { "epoch": 0.01, "grad_norm": 1.3950035572052002, "learning_rate": 5.099778270509978e-05, "loss": 8.1277, "step": 1150 },
    { "epoch": 0.01, "grad_norm": 1.5197688341140747, "learning_rate": 5.3215077605321506e-05, "loss": 8.0311, "step": 1200 },
    { "epoch": 0.01, "grad_norm": 1.3406693935394287, "learning_rate": 5.543237250554324e-05, "loss": 7.9824, "step": 1250 },
    { "epoch": 0.01, "grad_norm": 1.4520119428634644, "learning_rate": 5.7649667405764975e-05, "loss": 7.9948, "step": 1300 },
    { "epoch": 0.01, "grad_norm": 1.179124116897583, "learning_rate": 5.98669623059867e-05, "loss": 7.9144, "step": 1350 },
    { "epoch": 0.02, "grad_norm": 1.4039533138275146, "learning_rate": 6.208425720620842e-05, "loss": 7.8768, "step": 1400 },
    { "epoch": 0.02, "grad_norm": 1.5542700290679932, "learning_rate": 6.430155210643016e-05, "loss": 7.894, "step": 1450 },
    { "epoch": 0.02, "grad_norm": 1.4150550365447998, "learning_rate": 6.651884700665188e-05, "loss": 7.8409, "step": 1500 },
    { "epoch": 0.02, "grad_norm": 1.6647827625274658, "learning_rate": 6.873614190687362e-05, "loss": 7.91, "step": 1550 },
    { "epoch": 0.02, "grad_norm": 1.7795697450637817, "learning_rate": 7.095343680709535e-05, "loss": 7.8256, "step": 1600 },
    { "epoch": 0.02, "grad_norm": 1.933110237121582, "learning_rate": 7.317073170731707e-05, "loss": 7.8463, "step": 1650 },
    { "epoch": 0.02, "grad_norm": 1.1942570209503174, "learning_rate": 7.538802660753881e-05, "loss": 7.7827, "step": 1700 },
    { "epoch": 0.02, "grad_norm": 1.6759297847747803, "learning_rate": 7.760532150776053e-05, "loss": 7.8, "step": 1750 },
    { "epoch": 0.02, "grad_norm": 1.093256950378418, "learning_rate": 7.982261640798227e-05, "loss": 7.7461, "step": 1800 },
    { "epoch": 0.02, "grad_norm": 1.567872166633606, "learning_rate": 8.2039911308204e-05, "loss": 7.7338, "step": 1850 },
    { "epoch": 0.02, "grad_norm": 1.3017679452896118, "learning_rate": 8.425720620842572e-05, "loss": 7.804, "step": 1900 },
    { "epoch": 0.02, "grad_norm": 1.7510960102081299, "learning_rate": 8.647450110864746e-05, "loss": 7.7405, "step": 1950 },
    { "epoch": 0.02, "grad_norm": 1.7215120792388916, "learning_rate": 8.869179600886919e-05, "loss": 7.7429, "step": 2000 },
    { "epoch": 0.02, "grad_norm": 1.6202715635299683, "learning_rate": 9.090909090909092e-05, "loss": 7.6588, "step": 2050 },
    { "epoch": 0.02, "grad_norm": 1.5680756568908691, "learning_rate": 9.312638580931264e-05, "loss": 7.6224, "step": 2100 },
    { "epoch": 0.02, "grad_norm": 1.462240219116211, "learning_rate": 9.534368070953438e-05, "loss": 7.6851, "step": 2150 },
    { "epoch": 0.02, "grad_norm": 2.2018320560455322, "learning_rate": 9.75609756097561e-05, "loss": 7.6443, "step": 2200 },
    { "epoch": 0.02, "grad_norm": 1.9520208835601807, "learning_rate": 9.977827050997783e-05, "loss": 7.6456, "step": 2250 },
    { "epoch": 0.03, "grad_norm": 1.115421175956726, "learning_rate": 0.00010199556541019956, "loss": 7.5894, "step": 2300 },
    { "epoch": 0.03, "grad_norm": 1.6002250909805298, "learning_rate": 0.0001042128603104213, "loss": 7.6017, "step": 2350 },
    { "epoch": 0.03, "grad_norm": 1.6516796350479126, "learning_rate": 0.00010643015521064301, "loss": 7.4548, "step": 2400 },
    { "epoch": 0.03, "grad_norm": 2.2168257236480713, "learning_rate": 0.00010864745011086475, "loss": 7.5867, "step": 2450 },
    { "epoch": 0.03, "grad_norm": 1.5447593927383423, "learning_rate": 0.00011086474501108647, "loss": 7.5317, "step": 2500 },
    { "epoch": 0.03, "grad_norm": 1.6840906143188477, "learning_rate": 0.00011308203991130821, "loss": 7.5127, "step": 2550 },
    { "epoch": 0.03, "grad_norm": 1.2965503931045532, "learning_rate": 0.00011529933481152995, "loss": 7.4911, "step": 2600 },
    { "epoch": 0.03, "grad_norm": 1.643584966659546, "learning_rate": 0.00011751662971175166, "loss": 7.4416, "step": 2650 },
    { "epoch": 0.03, "grad_norm": 1.5419111251831055, "learning_rate": 0.0001197339246119734, "loss": 7.4944, "step": 2700 },
    { "epoch": 0.03, "grad_norm": 1.7774205207824707, "learning_rate": 0.00012195121951219512, "loss": 7.4244, "step": 2750 },
    { "epoch": 0.03, "grad_norm": 2.1709322929382324, "learning_rate": 0.00012416851441241685, "loss": 7.371, "step": 2800 },
    { "epoch": 0.03, "grad_norm": 1.5503411293029785, "learning_rate": 0.0001263858093126386, "loss": 7.3031, "step": 2850 },
    { "epoch": 0.03, "grad_norm": 1.7744035720825195, "learning_rate": 0.00012860310421286032, "loss": 7.3338, "step": 2900 },
    { "epoch": 0.03, "grad_norm": 2.2014000415802, "learning_rate": 0.00013082039911308205, "loss": 7.2962, "step": 2950 },
    { "epoch": 0.03, "grad_norm": 1.6716220378875732, "learning_rate": 0.00013303769401330377, "loss": 7.3348, "step": 3000 },
    { "epoch": 0.03, "grad_norm": 1.7045074701309204, "learning_rate": 0.0001352549889135255, "loss": 7.2864, "step": 3050 },
    { "epoch": 0.03, "grad_norm": 1.8933771848678589, "learning_rate": 0.00013747228381374724, "loss": 7.2744, "step": 3100 },
    { "epoch": 0.03, "grad_norm": 2.298779249191284, "learning_rate": 0.00013968957871396897, "loss": 7.2472, "step": 3150 },
    { "epoch": 0.04, "grad_norm": 1.3420922756195068, "learning_rate": 0.0001419068736141907, "loss": 7.3019, "step": 3200 },
    { "epoch": 0.04, "grad_norm": 1.9339039325714111, "learning_rate": 0.00014412416851441242, "loss": 7.2982, "step": 3250 },
    { "epoch": 0.04, "grad_norm": 2.69667387008667, "learning_rate": 0.00014634146341463414, "loss": 7.2851, "step": 3300 },
    { "epoch": 0.04, "grad_norm": 2.3124189376831055, "learning_rate": 0.0001485587583148559, "loss": 7.258, "step": 3350 },
    { "epoch": 0.04, "grad_norm": 1.975651741027832, "learning_rate": 0.00015077605321507762, "loss": 7.1275, "step": 3400 },
    { "epoch": 0.04, "grad_norm": 1.9704022407531738, "learning_rate": 0.00015299334811529934, "loss": 7.1473, "step": 3450 },
    { "epoch": 0.04, "grad_norm": 2.5047757625579834, "learning_rate": 0.00015521064301552106, "loss": 7.1096, "step": 3500 },
    { "epoch": 0.04, "grad_norm": 1.5465894937515259, "learning_rate": 0.0001574279379157428, "loss": 7.1501, "step": 3550 },
    { "epoch": 0.04, "grad_norm": 1.9557933807373047, "learning_rate": 0.00015964523281596454, "loss": 7.2033, "step": 3600 },
    { "epoch": 0.04, "grad_norm": 2.420116424560547, "learning_rate": 0.00016186252771618626, "loss": 7.1275, "step": 3650 },
    { "epoch": 0.04, "grad_norm": 2.114737033843994, "learning_rate": 0.000164079822616408, "loss": 7.0932, "step": 3700 },
    { "epoch": 0.04, "grad_norm": 2.3085389137268066, "learning_rate": 0.00016629711751662974, "loss": 7.0311, "step": 3750 },
    { "epoch": 0.04, "grad_norm": 2.5679140090942383, "learning_rate": 0.00016851441241685144, "loss": 6.9168, "step": 3800 },
    { "epoch": 0.04, "grad_norm": 1.8611838817596436, "learning_rate": 0.0001707317073170732, "loss": 7.0085, "step": 3850 },
    { "epoch": 0.04, "grad_norm": 1.8603994846343994, "learning_rate": 0.0001729490022172949, "loss": 6.9432, "step": 3900 },
    { "epoch": 0.04, "grad_norm": 2.4244627952575684, "learning_rate": 0.00017516629711751663, "loss": 6.9333, "step": 3950 },
    { "epoch": 0.04, "grad_norm": 2.177870750427246, "learning_rate": 0.00017738359201773839, "loss": 6.9499, "step": 4000 },
    { "epoch": 0.04, "grad_norm": 1.9320554733276367, "learning_rate": 0.00017960088691796008, "loss": 6.8204, "step": 4050 },
    { "epoch": 0.05, "grad_norm": 1.5062849521636963, "learning_rate": 0.00018181818181818183, "loss": 6.9505, "step": 4100 },
    { "epoch": 0.05, "grad_norm": 2.9272422790527344, "learning_rate": 0.00018403547671840356, "loss": 6.8701, "step": 4150 },
    { "epoch": 0.05, "grad_norm": 2.0309596061706543, "learning_rate": 0.00018625277161862528, "loss": 6.924, "step": 4200 },
    { "epoch": 0.05, "grad_norm": 2.0265886783599854, "learning_rate": 0.00018847006651884703, "loss": 6.9223, "step": 4250 },
    { "epoch": 0.05, "grad_norm": 2.5160486698150635, "learning_rate": 0.00019068736141906876, "loss": 6.8708, "step": 4300 },
    { "epoch": 0.05, "grad_norm": 2.613301992416382, "learning_rate": 0.00019290465631929045, "loss": 6.8937, "step": 4350 },
    { "epoch": 0.05, "grad_norm": 2.3031229972839355, "learning_rate": 0.0001951219512195122, "loss": 6.8337, "step": 4400 },
    { "epoch": 0.05, "grad_norm": 2.54779052734375, "learning_rate": 0.00019733924611973393, "loss": 6.8334, "step": 4450 },
    { "epoch": 0.05, "grad_norm": 2.8277971744537354, "learning_rate": 0.00019955654101995565, "loss": 6.7925, "step": 4500 },
    { "epoch": 0.05, "grad_norm": 2.0113885402679443, "learning_rate": 0.00019999989242739025, "loss": 6.8458, "step": 4550 },
    { "epoch": 0.05, "grad_norm": 2.2395377159118652, "learning_rate": 0.00019999945541405976, "loss": 6.6251, "step": 4600 },
    { "epoch": 0.05, "grad_norm": 2.445993423461914, "learning_rate": 0.0001999986822381884, "loss": 6.8099, "step": 4650 },
    { "epoch": 0.05, "grad_norm": 4.077752590179443, "learning_rate": 0.0001999975729023753, "loss": 6.8053, "step": 4700 },
    { "epoch": 0.05, "grad_norm": 3.167569875717163, "learning_rate": 0.00019999612741034963, "loss": 6.7706, "step": 4750 },
    { "epoch": 0.05, "grad_norm": 1.893659234046936, "learning_rate": 0.00019999434576697066, "loss": 6.8245, "step": 4800 },
    { "epoch": 0.05, "grad_norm": 3.6101326942443848, "learning_rate": 0.00019999222797822762, "loss": 6.7407, "step": 4850 },
    { "epoch": 0.05, "grad_norm": 2.2858726978302, "learning_rate": 0.00019998977405123974, "loss": 6.74, "step": 4900 },
    { "epoch": 0.05, "grad_norm": 1.9325459003448486, "learning_rate": 0.0001999869839942563, "loss": 6.716, "step": 4950 },
    { "epoch": 0.06, "grad_norm": 2.0043437480926514, "learning_rate": 0.00019998385781665643, "loss": 6.6003, "step": 5000 },
    { "epoch": 0.06, "grad_norm": 4.151523113250732, "learning_rate": 0.00019998039552894924, "loss": 6.6801, "step": 5050 },
    { "epoch": 0.06, "grad_norm": 3.8407771587371826, "learning_rate": 0.00019997659714277372, "loss": 6.608, "step": 5100 },
    { "epoch": 0.06, "grad_norm": 2.230713129043579, "learning_rate": 0.00019997246267089867, "loss": 6.6479, "step": 5150 },
    { "epoch": 0.06, "grad_norm": 2.2546942234039307, "learning_rate": 0.0001999679921272227, "loss": 6.6548, "step": 5200 },
    { "epoch": 0.06, "grad_norm": 3.180986166000366, "learning_rate": 0.00019996318552677425, "loss": 6.6851, "step": 5250 },
    { "epoch": 0.06, "grad_norm": 2.341231346130371, "learning_rate": 0.00019995804288571134, "loss": 6.547, "step": 5300 },
    { "epoch": 0.06, "grad_norm": 3.1117124557495117, "learning_rate": 0.00019995256422132172, "loss": 6.7072, "step": 5350 },
    { "epoch": 0.06, "grad_norm": 2.0082530975341797, "learning_rate": 0.0001999467495520227, "loss": 6.5422, "step": 5400 },
    { "epoch": 0.06, "grad_norm": 2.409489870071411, "learning_rate": 0.0001999405988973611, "loss": 6.3716, "step": 5450 },
    { "epoch": 0.06, "grad_norm": 2.649052381515503, "learning_rate": 0.00019993411227801328, "loss": 6.6434, "step": 5500 },
    { "epoch": 0.06, "grad_norm": 3.081116199493408, "learning_rate": 0.00019992728971578492, "loss": 6.4624, "step": 5550 },
    { "epoch": 0.06, "grad_norm": 3.1578280925750732, "learning_rate": 0.00019992013123361102, "loss": 6.5416, "step": 5600 },
    { "epoch": 0.06, "grad_norm": 3.7874557971954346, "learning_rate": 0.0001999126368555559, "loss": 6.4512, "step": 5650 },
    { "epoch": 0.06, "grad_norm": 2.7693099975585938, "learning_rate": 0.00019990480660681293, "loss": 6.5105, "step": 5700 },
    { "epoch": 0.06, "grad_norm": 2.4338185787200928, "learning_rate": 0.00019989680712666593, "loss": 6.5092, "step": 5750 },
    { "epoch": 0.06, "grad_norm": 3.656937837600708, "learning_rate": 0.00019988831193270577, "loss": 6.4269, "step": 5800 },
    { "epoch": 0.06, "grad_norm": 2.857292652130127, "learning_rate": 0.00019987948094982952, "loss": 6.4387, "step": 5850 },
    { "epoch": 0.07, "grad_norm": 3.4963467121124268, "learning_rate": 0.00019987031420772385, "loss": 6.3851, "step": 5900 },
    { "epoch": 0.07, "grad_norm": 2.602522611618042, "learning_rate": 0.00019986081173720396, "loss": 6.3413, "step": 5950 },
    { "epoch": 0.07, "grad_norm": 2.6455273628234863, "learning_rate": 0.00019985097357021385, "loss": 6.2965, "step": 6000 },
    { "epoch": 0.07, "grad_norm": 3.5592167377471924, "learning_rate": 0.0001998407997398259, "loss": 6.4293, "step": 6050 },
    { "epoch": 0.07, "grad_norm": 3.6016533374786377, "learning_rate": 0.00019983029028024094, "loss": 6.2897, "step": 6100 },
    { "epoch": 0.07, "grad_norm": 2.5536839962005615, "learning_rate": 0.000199819445226788, "loss": 6.3157, "step": 6150 },
    { "epoch": 0.07, "grad_norm": 2.0514349937438965, "learning_rate": 0.00019980826461592427, "loss": 6.3847, "step": 6200 },
    { "epoch": 0.07, "grad_norm": 2.72495174407959, "learning_rate": 0.00019979674848523505, "loss": 6.3517, "step": 6250 },
    { "epoch": 0.07, "grad_norm": 2.4264872074127197, "learning_rate": 0.00019978489687343335, "loss": 6.2533, "step": 6300 },
    { "epoch": 0.07, "grad_norm": 2.8361423015594482, "learning_rate": 0.0001997727098203602, "loss": 6.3654, "step": 6350 },
    { "epoch": 0.07, "grad_norm": 2.9690892696380615, "learning_rate": 0.00019976018736698404, "loss": 6.3968, "step": 6400 },
    { "epoch": 0.07, "grad_norm": 2.6132867336273193, "learning_rate": 0.0001997473295554009, "loss": 6.3444, "step": 6450 },
    { "epoch": 0.07, "grad_norm": 4.820697784423828, "learning_rate": 0.00019973413642883424, "loss": 6.2019, "step": 6500 },
    { "epoch": 0.07, "grad_norm": 2.2316782474517822, "learning_rate": 0.00019972060803163458, "loss": 6.2049, "step": 6550 },
    { "epoch": 0.07, "grad_norm": 3.9528305530548096, "learning_rate": 0.00019970674440927957, "loss": 6.1718, "step": 6600 },
    { "epoch": 0.07, "grad_norm": 1.891073226928711, "learning_rate": 0.0001996925456083738, "loss": 6.2393, "step": 6650 },
    { "epoch": 0.07, "grad_norm": 2.813270092010498, "learning_rate": 0.00019967801167664853, "loss": 6.2116, "step": 6700 },
    { "epoch": 0.07, "grad_norm": 2.2726826667785645, "learning_rate": 0.00019966314266296173, "loss": 6.1521, "step": 6750 },
    { "epoch": 0.08, "grad_norm": 2.3895318508148193, "learning_rate": 0.00019964793861729772, "loss": 6.1072, "step": 6800 },
    { "epoch": 0.08, "grad_norm": 3.190431833267212, "learning_rate": 0.000199632399590767, "loss": 6.2009, "step": 6850 },
    { "epoch": 0.08, "grad_norm": 3.79266095161438, "learning_rate": 0.00019961652563560634, "loss": 6.028, "step": 6900 },
    { "epoch": 0.08, "grad_norm": 3.260039806365967, "learning_rate": 0.00019960031680517826, "loss": 6.0733, "step": 6950 },
    { "epoch": 0.08, "grad_norm": 3.0739686489105225, "learning_rate": 0.0001995837731539711, "loss": 6.0521, "step": 7000 },
    { "epoch": 0.08, "grad_norm": 3.0517771244049072, "learning_rate": 0.00019956689473759872, "loss": 6.0544, "step": 7050 },
    { "epoch": 0.08, "grad_norm": 3.9524648189544678, "learning_rate": 0.0001995496816128003, "loss": 6.1326, "step": 7100 },
    { "epoch": 0.08, "grad_norm": 4.498497486114502, "learning_rate": 0.00019953213383744033, "loss": 6.236, "step": 7150 },
    { "epoch": 0.08, "grad_norm": 4.157576084136963, "learning_rate": 0.00019951425147050807, "loss": 5.9898, "step": 7200 },
    { "epoch": 0.08, "grad_norm": 3.9297516345977783, "learning_rate": 0.00019949603457211775, "loss": 6.086, "step": 7250 },
    { "epoch": 0.08, "grad_norm": 3.3214786052703857, "learning_rate": 0.00019947748320350804, "loss": 5.9589, "step": 7300 },
    { "epoch": 0.08, "grad_norm": 2.8847291469573975, "learning_rate": 0.00019945859742704201, "loss": 6.1931, "step": 7350 },
    { "epoch": 0.08, "grad_norm": 3.387896776199341, "learning_rate": 0.00019943937730620702, "loss": 6.0539, "step": 7400 },
    { "epoch": 0.08, "grad_norm": 3.1214797496795654, "learning_rate": 0.00019941982290561417, "loss": 6.0288, "step": 7450 },
    { "epoch": 0.08, "grad_norm": 3.7995123863220215, "learning_rate": 0.00019939993429099841, "loss": 6.0526, "step": 7500 },
    { "epoch": 0.08, "grad_norm": 4.788393974304199, "learning_rate": 0.00019937971152921818, "loss": 5.9799, "step": 7550 },
    { "epoch": 0.08, "grad_norm": 4.009220123291016, "learning_rate": 0.0001993591546882552, "loss": 6.1223, "step": 7600 },
    { "epoch": 0.08, "grad_norm": 3.5576276779174805, "learning_rate": 0.00019933826383721428, "loss": 5.989, "step": 7650 },
    { "epoch": 0.09, "grad_norm": 3.1287412643432617, "learning_rate": 0.00019931703904632294, "loss": 6.0542, "step": 7700 },
    { "epoch": 0.09, "grad_norm": 3.6518595218658447, "learning_rate": 0.00019929548038693146, "loss": 6.041, "step": 7750 },
    { "epoch": 0.09, "grad_norm": 3.268080472946167, "learning_rate": 0.0001992735879315123, "loss": 5.888, "step": 7800 },
    { "epoch": 0.09, "grad_norm": 3.6055593490600586, "learning_rate": 0.00019925136175366007, "loss": 5.913, "step": 7850 },
    { "epoch": 0.09, "grad_norm": 4.866463661193848, "learning_rate": 0.00019922880192809137, "loss": 5.9858, "step": 7900 },
    { "epoch": 0.09, "grad_norm": 3.44808292388916, "learning_rate": 0.00019920590853064423, "loss": 5.7686, "step": 7950 },
    { "epoch": 0.09, "grad_norm": 2.9507765769958496, "learning_rate": 0.00019918268163827808, "loss": 5.8557, "step": 8000 },
    { "epoch": 0.09, "grad_norm": 3.441870927810669, "learning_rate": 0.00019915912132907352, "loss": 5.8268, "step": 8050 },
    { "epoch": 0.09, "grad_norm": 3.838809013366699, "learning_rate": 0.00019913522768223182, "loss": 5.9833, "step": 8100 },
    { "epoch": 0.09, "grad_norm": 4.165487289428711, "learning_rate": 0.00019911100077807498, "loss": 5.7422, "step": 8150 },
    { "epoch": 0.09, "grad_norm": 3.5947463512420654, "learning_rate": 0.0001990864406980452, "loss": 5.7479, "step": 8200 },
    { "epoch": 0.09, "grad_norm": 4.130446434020996, "learning_rate": 0.00019906154752470472, "loss": 5.7767, "step": 8250 },
    { "epoch": 0.09, "grad_norm": 4.866550922393799, "learning_rate": 0.00019903632134173554, "loss": 5.7681, "step": 8300 },
    { "epoch": 0.09, "grad_norm": 3.2839725017547607, "learning_rate": 0.00019901076223393903, "loss": 5.6656, "step": 8350 },
    { "epoch": 0.09, "grad_norm": 3.0762476921081543, "learning_rate": 0.0001989848702872359, "loss": 5.789, "step": 8400 },
    { "epoch": 0.09, "grad_norm": 3.7109107971191406, "learning_rate": 0.00019895864558866556, "loss": 5.773, "step": 8450 },
    { "epoch": 0.09, "grad_norm": 5.400998115539551, "learning_rate": 0.00019893208822638618, "loss": 5.7506, "step": 8500 },
    { "epoch": 0.09, "grad_norm": 3.3062849044799805, "learning_rate": 0.00019890519828967413, "loss": 5.7515, "step": 8550 },
    { "epoch": 0.1, "grad_norm": 4.109920501708984, "learning_rate": 0.00019887797586892373, "loss": 5.7972, "step": 8600 },
    { "epoch": 0.1, "grad_norm": 3.4838390350341797, "learning_rate": 0.00019885042105564717, "loss": 5.6753, "step": 8650 },
    { "epoch": 0.1, "grad_norm": 4.251760959625244, "learning_rate": 0.00019882253394247381, "loss": 5.6303, "step": 8700 },
    { "epoch": 0.1, "grad_norm": 4.042376518249512, "learning_rate": 0.00019879431462315025, "loss": 5.5753, "step": 8750 },
    { "epoch": 0.1, "grad_norm": 4.239652633666992, "learning_rate": 0.0001987657631925398, "loss": 5.5335, "step": 8800 },
    { "epoch": 0.1, "grad_norm": 5.15481424331665, "learning_rate": 0.00019873687974662215, "loss": 5.5396, "step": 8850 },
    { "epoch": 0.1, "grad_norm": 4.36835241317749, "learning_rate": 0.00019870766438249317, "loss": 5.6017, "step": 8900 },
    { "epoch": 0.1, "grad_norm": 4.165258407592773, "learning_rate": 0.00019867811719836452, "loss": 5.7228, "step": 8950 },
    { "epoch": 0.1, "grad_norm": 4.125988006591797, "learning_rate": 0.0001986482382935633, "loss": 5.5787, "step": 9000 },
    { "epoch": 0.1, "grad_norm": 4.177731037139893, "learning_rate": 0.0001986180277685317, "loss": 5.5829, "step": 9050 },
    { "epoch": 0.1, "grad_norm": 5.006561279296875, "learning_rate": 0.00019858748572482683, "loss": 5.5466, "step": 9100 },
    { "epoch": 0.1, "grad_norm": 4.33070182800293, "learning_rate": 0.00019855661226512007, "loss": 5.5544, "step": 9150 },
    { "epoch": 0.1, "grad_norm": 4.358560085296631, "learning_rate": 0.00019852540749319708, "loss": 5.4599, "step": 9200 },
    { "epoch": 0.1, "grad_norm": 4.536096096038818, "learning_rate": 0.00019849387151395708, "loss": 5.4983, "step": 9250 },
    { "epoch": 0.1, "grad_norm": 4.66163444519043, "learning_rate": 0.0001984620044334129, "loss": 5.4097, "step": 9300 },
    { "epoch": 0.1, "grad_norm": 4.4319233894348145, "learning_rate": 0.00019842980635869024, "loss": 5.4093, "step": 9350 },
    { "epoch": 0.1, "grad_norm": 4.98419713973999, "learning_rate": 0.0001983972773980276, "loss": 5.4056, "step": 9400 },
    { "epoch": 0.1, "grad_norm": 3.6354339122772217, "learning_rate": 0.0001983644176607757, "loss": 5.3171, "step": 9450 },
    { "epoch": 0.11, "grad_norm": 4.495342254638672, "learning_rate": 0.00019833122725739736, "loss": 5.4521, "step": 9500 },
    { "epoch": 0.11, "grad_norm": 4.5558671951293945, "learning_rate": 0.00019829770629946678, "loss": 5.5158, "step": 9550 },
    { "epoch": 0.11, "grad_norm": 3.7165732383728027, "learning_rate": 0.00019826385489966957, "loss": 5.301, "step": 9600 },
    { "epoch": 0.11, "grad_norm": 6.030915260314941, "learning_rate": 0.00019822967317180204, "loss": 5.3316, "step": 9650 },
    { "epoch": 0.11, "grad_norm": 5.385923385620117, "learning_rate": 0.00019819516123077094, "loss": 5.3844, "step": 9700 },
    { "epoch": 0.11, "grad_norm": 4.383516788482666, "learning_rate": 0.00019816101926755305, "loss": 5.2995, "step": 9750 },
    { "epoch": 0.11, "grad_norm": 4.446406364440918, "learning_rate": 0.00019812585384780055, "loss": 5.386, "step": 9800 },
    { "epoch": 0.11, "grad_norm": 4.345483303070068, "learning_rate": 0.00019809035856388805, "loss": 5.2815, "step": 9850 },
    { "epoch": 0.11, "grad_norm": 4.791261672973633, "learning_rate": 0.00019805453353513813, "loss": 5.3757, "step": 9900 },
    { "epoch": 0.11, "grad_norm": 5.622151851654053, "learning_rate": 0.00019801837888198172, "loss": 5.4405, "step": 9950 },
    { "epoch": 0.11, "grad_norm": 4.934606075286865, "learning_rate": 0.0001979818947259579, "loss": 5.139, "step": 10000 },
    { "epoch": 0.11, "grad_norm": 3.9659693241119385, "learning_rate": 0.0001979450811897134, "loss": 5.1726, "step": 10050 },
    { "epoch": 0.11, "grad_norm": 5.214992046356201, "learning_rate": 0.00019790793839700226, "loss": 5.2864, "step": 10100 },
    { "epoch": 0.11, "grad_norm": 4.5359601974487305, "learning_rate": 0.00019787046647268524, "loss": 5.1443, "step": 10150 },
    { "epoch": 0.11, "grad_norm": 4.26462984085083, "learning_rate": 0.00019783266554272962, "loss": 5.0597, "step": 10200 },
    { "epoch": 0.11, "grad_norm": 5.053945064544678, "learning_rate": 0.00019779453573420873, "loss": 5.2946, "step": 10250 },
    { "epoch": 0.11, "grad_norm": 6.082211494445801, "learning_rate": 0.00019775607717530127, "loss": 5.2075, "step": 10300 },
    { "epoch": 0.11, "grad_norm": 4.107390403747559, "learning_rate": 0.00019771728999529132, "loss": 5.1394, "step": 10350 },
    { "epoch": 0.12, "grad_norm": 4.58411169052124, "learning_rate": 0.00019767817432456752, "loss": 5.1064, "step": 10400 },
    { "epoch": 0.12, "grad_norm": 8.38965892791748, "learning_rate": 0.00019763952239228627, "loss": 5.0808, "step": 10450 },
    { "epoch": 0.12, "grad_norm": 3.885803699493408, "learning_rate": 0.00019759975669894338, "loss": 5.0664, "step": 10500 },
    { "epoch": 0.12, "grad_norm": 4.1605916023254395, "learning_rate": 0.00019755966290999167, "loss": 5.2469, "step": 10550 },
    { "epoch": 0.12, "grad_norm": 4.821887016296387, "learning_rate": 0.00019751924116021225, "loss": 5.2451, "step": 10600 },
    { "epoch": 0.12, "grad_norm": 3.865694761276245, "learning_rate": 0.00019747849158548858, "loss": 5.2334, "step": 10650 },
    { "epoch": 0.12, "grad_norm": 3.640681028366089, "learning_rate": 0.00019743741432280625, "loss": 5.1206, "step": 10700 },
    { "epoch": 0.12, "grad_norm": 4.04166316986084, "learning_rate": 0.00019739600951025236, "loss": 5.0059, "step": 10750 },
    { "epoch": 0.12, "grad_norm": 4.637605667114258, "learning_rate": 0.00019735427728701516, "loss": 5.0302, "step": 10800 },
    { "epoch": 0.12, "grad_norm": 4.08723783493042, "learning_rate": 0.0001973122177933835, "loss": 5.1551, "step": 10850 },
    { "epoch": 0.12, "grad_norm": 3.7944953441619873, "learning_rate": 0.00019726983117074643, "loss": 5.0665, "step": 10900 },
    { "epoch": 0.12, "grad_norm": 5.2847371101379395, "learning_rate": 0.00019722711756159266, "loss": 5.2212, "step": 10950 },
    { "epoch": 0.12, "grad_norm": 4.109150409698486, "learning_rate": 0.00019718407710951012, "loss": 5.2645, "step": 11000 },
    { "epoch": 0.12, "grad_norm": 4.127768039703369, "learning_rate": 0.0001971407099591855, "loss": 5.0395, "step": 11050 },
    { "epoch": 0.12, "grad_norm": 5.058667182922363, "learning_rate": 0.00019709701625640367, "loss": 5.0247, "step": 11100 },
    { "epoch": 0.12, "grad_norm": 5.4407267570495605, "learning_rate": 0.00019705299614804732, "loss": 4.9935, "step": 11150 },
    { "epoch": 0.12, "grad_norm": 3.7877707481384277, "learning_rate": 0.00019700864978209636, "loss": 5.074, "step": 11200 },
    { "epoch": 0.12, "grad_norm": 3.777330160140991, "learning_rate": 0.00019696397730762746, "loss": 5.0458, "step": 11250 },
    { "epoch": 0.13, "grad_norm": 4.143067836761475, "learning_rate": 0.0001969189788748136, "loss": 4.9375, "step": 11300 },
    { "epoch": 0.13, "grad_norm": 5.560107231140137, "learning_rate": 0.00019687365463492344, "loss": 4.8285, "step": 11350 },
    { "epoch": 0.13, "grad_norm": 4.057905197143555, "learning_rate": 0.00019682800474032095, "loss": 4.9753, "step": 11400 },
    { "epoch": 0.13, "grad_norm": 3.835442066192627, "learning_rate": 0.00019678202934446482, "loss": 4.9368, "step": 11450 },
    { "epoch": 0.13, "grad_norm": 5.135551929473877, "learning_rate": 0.0001967357286019079, "loss": 4.9994, "step": 11500 },
    { "epoch": 0.13, "grad_norm": 4.615053653717041, "learning_rate": 0.00019668910266829685, "loss": 5.0182, "step": 11550 },
    { "epoch": 0.13, "grad_norm": 4.474258899688721, "learning_rate": 0.0001966421517003714, "loss": 4.8704, "step": 11600 },
    { "epoch": 0.13, "grad_norm": 4.264945030212402, "learning_rate": 0.00019659487585596406, "loss": 4.9076, "step": 11650 },
    { "epoch": 0.13, "grad_norm": 4.091209411621094, "learning_rate": 0.00019654727529399925, "loss": 4.7135, "step": 11700 },
    { "epoch": 0.13, "grad_norm": 4.154038429260254, "learning_rate": 0.00019649935017449318, "loss": 4.8239, "step": 11750 },
    { "epoch": 0.13, "grad_norm": 3.697162628173828, "learning_rate": 0.00019645110065855305, "loss": 4.9972, "step": 11800 },
    { "epoch": 0.13, "grad_norm": 4.0024847984313965, "learning_rate": 0.00019640252690837645, "loss": 4.8854, "step": 11850 },
    { "epoch": 0.13, "grad_norm": 3.9416885375976562, "learning_rate": 0.0001963536290872511, "loss": 4.8547, "step": 11900 },
    { "epoch": 0.13, "grad_norm": 3.978651285171509, "learning_rate": 0.000196304407359554, "loss": 4.7873, "step": 11950 },
    { "epoch": 0.13, "grad_norm": 4.435175895690918, "learning_rate": 0.0001962548618907511, "loss": 4.8124, "step": 12000 },
    { "epoch": 0.13, "grad_norm": 3.8776824474334717, "learning_rate": 0.00019620499284739662, "loss": 4.8896, "step": 12050 },
    { "epoch": 0.13, "grad_norm": 5.041496276855469, "learning_rate": 0.00019615480039713248, "loss": 4.8343, "step": 12100 },
    { "epoch": 0.13, "grad_norm": 4.18281888961792, "learning_rate": 0.00019610428470868784, "loss": 4.8559, "step": 12150 },
    { "epoch": 0.14, "grad_norm": 4.223630905151367, "learning_rate": 0.00019605344595187844, "loss": 4.8153, "step": 12200 },
    { "epoch": 0.14, "grad_norm": 4.63677453994751, "learning_rate": 0.0001960022842976061, "loss": 4.7951, "step": 12250 },
    { "epoch": 0.14, "grad_norm": 4.188296794891357, "learning_rate": 0.00019595079991785802, "loss": 4.8904, "step": 12300 },
    { "epoch": 0.14, "grad_norm": 4.402559280395508, "learning_rate": 0.00019589899298570634, "loss": 4.7851, "step": 12350 },
    { "epoch": 0.14, "grad_norm": 5.976877212524414, "learning_rate": 0.00019584686367530755, "loss": 4.6431, "step": 12400 },
    { "epoch": 0.14, "grad_norm": 4.849298477172852, "learning_rate": 0.0001957944121619018, "loss": 4.7544, "step": 12450 },
    { "epoch": 0.14, "grad_norm": 4.932714462280273, "learning_rate": 0.0001957416386218124, "loss": 4.6811, "step": 12500 },
    { "epoch": 0.14, "grad_norm": 4.682474136352539, "learning_rate": 0.00019568854323244515, "loss": 4.799, "step": 12550 },
    { "epoch": 0.14, "grad_norm": 5.228520393371582, "learning_rate": 0.00019563619766470511, "loss": 4.7622, "step": 12600 },
    { "epoch": 0.14, "grad_norm": 4.093870162963867, "learning_rate": 0.00019558246554138458, "loss": 4.7369, "step": 12650 },
    { "epoch": 0.14, "grad_norm": 5.248356342315674, "learning_rate": 0.0001955284121038694, "loss": 4.7519, "step": 12700 },
    { "epoch": 0.14, "grad_norm": 3.924299955368042, "learning_rate": 0.00019547403753386803, "loss": 4.6441, "step": 12750 },
    { "epoch": 0.14, "grad_norm": 4.972569942474365, "learning_rate": 0.00019542043906868188, "loss": 4.7192, "step": 12800 },
    { "epoch": 0.14, "grad_norm": 5.033604145050049, "learning_rate": 0.00019536542919665846, "loss": 4.6397, "step": 12850 },
    { "epoch": 0.14, "grad_norm": 5.222695350646973, "learning_rate": 0.00019531009874003928, "loss": 4.6309, "step": 12900 },
    { "epoch": 0.14, "grad_norm": 3.810999631881714, "learning_rate": 0.00019525444788482562, "loss": 4.6513, "step": 12950 },
    { "epoch": 0.14, "grad_norm": 5.272600173950195, "learning_rate": 0.00019519847681809585, "loss": 4.8001, "step": 13000 },
    { "epoch": 0.14, "grad_norm": 4.836308002471924, "learning_rate": 0.00019514218572800468, "loss": 4.7101, "step": 13050 },
    { "epoch": 0.15, "grad_norm": 4.598148345947266, "learning_rate": 0.00019508557480378276, "loss": 4.5578, "step": 13100 },
    { "epoch": 0.15, "grad_norm": 3.910820722579956, "learning_rate": 0.0001950286442357358, "loss": 4.7124, "step": 13150 },
    { "epoch": 0.15, "grad_norm": 3.856081962585449, "learning_rate": 0.00019497139421524416, "loss": 4.7563, "step": 13200 },
    { "epoch": 0.15, "grad_norm": 4.151907920837402, "learning_rate": 0.00019491382493476195, "loss": 4.6726, "step": 13250 },
    { "epoch": 0.15, "grad_norm": 4.349935054779053, "learning_rate": 0.0001948559365878166, "loss": 4.6341, "step": 13300 },
    { "epoch": 0.15, "grad_norm": 3.8229756355285645, "learning_rate": 0.00019479772936900811, "loss": 4.6183, "step": 13350 },
    { "epoch": 0.15, "grad_norm": 5.495506286621094, "learning_rate": 0.0001947392034740084, "loss": 4.6608, "step": 13400 },
    { "epoch": 0.15, "grad_norm": 4.307513236999512, "learning_rate": 0.00019468035909956072, "loss": 4.6805, "step": 13450 },
    { "epoch": 0.15, "grad_norm": 3.939659595489502, "learning_rate": 0.0001946211964434788, "loss": 4.679, "step": 13500 },
    { "epoch": 0.15, "grad_norm": 5.444967269897461, "learning_rate": 0.00019456171570464653, "loss": 4.7195, "step": 13550 },
    { "epoch": 0.15, "grad_norm": 4.513270854949951, "learning_rate": 0.00019450191708301687, "loss": 4.5367, "step": 13600 },
    { "epoch": 0.15, "grad_norm": 4.617405414581299, "learning_rate": 0.00019444180077961146, "loss": 4.5742, "step": 13650 },
    { "epoch": 0.15, "grad_norm": 4.580646991729736, "learning_rate": 0.00019438136699652001, "loss": 4.4936, "step": 13700 },
    { "epoch": 0.15, "grad_norm": 4.657532691955566, "learning_rate": 0.00019432061593689927, "loss": 4.6877, "step": 13750 },
    { "epoch": 0.15, "grad_norm": 5.374803066253662, "learning_rate": 0.0001942595478049727, "loss": 4.6101, "step": 13800 },
    { "epoch": 0.15, "grad_norm": 5.1111650466918945, "learning_rate": 0.00019419816280602962, "loss": 4.6185, "step": 13850 },
    { "epoch": 0.15, "grad_norm": 5.18306303024292, "learning_rate": 0.00019413646114642446, "loss": 4.5524, "step": 13900 },
    { "epoch": 0.15, "grad_norm": 4.411191463470459, "learning_rate": 0.00019407444303357624, "loss": 4.4346, "step": 13950 },
    { "epoch": 0.16, "grad_norm": 4.161925792694092, "learning_rate": 0.0001940121086759678, "loss": 4.3702, "step": 14000 },
    { "epoch": 0.16, "grad_norm": 5.059813022613525, "learning_rate": 0.000193949458283145, "loss": 4.5351, "step": 14050 },
    { "epoch": 0.16, "grad_norm": 5.563150882720947, "learning_rate": 0.00019388649206571616, "loss": 4.477, "step": 14100 },
    { "epoch": 0.16, "grad_norm": 5.1144609451293945, "learning_rate": 0.00019382321023535127, "loss": 4.6033, "step": 14150 },
    { "epoch": 0.16, "grad_norm": 4.734794616699219, "learning_rate": 0.00019375961300478127, "loss": 4.5287, "step": 14200 },
    { "epoch": 0.16, "grad_norm": 4.543684959411621, "learning_rate": 0.00019369570058779743, "loss": 4.4474, "step": 14250 },
    { "epoch": 0.16, "grad_norm": 5.4647979736328125, "learning_rate": 0.00019363147319925047, "loss": 4.3806, "step": 14300 },
    { "epoch": 0.16, "grad_norm": 5.058681964874268, "learning_rate": 0.00019356693105505006, "loss": 4.4998, "step": 14350 },
    { "epoch": 0.16, "grad_norm": 5.494804859161377, "learning_rate": 0.00019350207437216386, "loss": 4.3911, "step": 14400 },
    { "epoch": 0.16, "grad_norm": 5.227470397949219, "learning_rate": 0.00019343690336861687, "loss": 4.2557, "step": 14450 },
    { "epoch": 0.16, "grad_norm": 3.7686829566955566, "learning_rate": 0.00019337141826349092, "loss": 4.313, "step": 14500 },
    { "epoch": 0.16, "grad_norm": 4.975152492523193, "learning_rate": 0.00019330561927692345, "loss": 4.2914, "step": 14550 },
    { "epoch": 0.16, "grad_norm": 5.811885356903076, "learning_rate": 0.00019323950663010733, "loss": 4.3566, "step": 14600 },
    { "epoch": 0.16, "grad_norm": 5.566829204559326, "learning_rate": 0.00019317308054528966, "loss": 4.2847, "step": 14650 },
    { "epoch": 0.16, "grad_norm": 5.977478504180908, "learning_rate": 0.0001931063412457713, "loss": 4.3034, "step": 14700 },
    { "epoch": 0.16, "grad_norm": 4.601086616516113, "learning_rate": 0.00019303928895590596, "loss": 4.1929, "step": 14750 },
    { "epoch": 0.16, "grad_norm": 5.051478385925293, "learning_rate": 0.0001929719239010996, "loss": 4.2749, "step": 14800 },
    { "epoch": 0.16, "grad_norm": 6.248847961425781, "learning_rate": 0.00019290424630780947, "loss": 4.3419, "step": 14850 },
    { "epoch": 0.17, "grad_norm": 5.392062664031982, "learning_rate": 0.0001928362564035436, "loss": 4.4038, "step": 14900 },
    { "epoch": 0.17, "grad_norm": 5.6346211433410645, "learning_rate": 0.00019276795441685975, "loss": 4.3403, "step": 14950 },
    { "epoch": 0.17, "grad_norm": 5.646982192993164, "learning_rate": 0.00019269934057736493, "loss": 4.252, "step": 15000 },
    { "epoch": 0.17, "grad_norm": 5.455059051513672, "learning_rate": 0.00019263041511571438, "loss": 4.3809, "step": 15050 },
    { "epoch": 0.17, "grad_norm": 5.478726387023926, "learning_rate": 0.00019256117826361096, "loss": 4.1885, "step": 15100 },
    { "epoch": 0.17, "grad_norm": 5.029292106628418, "learning_rate": 0.0001924916302538043, "loss": 4.2615, "step": 15150 },
    { "epoch": 0.17, "grad_norm": 5.6447978019714355, "learning_rate": 0.00019242177132009, "loss": 4.268, "step": 15200 },
    { "epoch": 0.17, "grad_norm": 5.165138244628906, "learning_rate": 0.00019235160169730895, "loss": 4.3222, "step": 15250 },
    { "epoch": 0.17, "grad_norm": 5.661884784698486, "learning_rate": 0.00019228112162134641, "loss": 4.3179, "step": 15300 },
    { "epoch": 0.17, "grad_norm": 6.117990493774414, "learning_rate": 0.0001922103313291313, "loss": 4.2241, "step": 15350 },
    { "epoch": 0.17, "grad_norm": 4.299765110015869, "learning_rate": 0.0001921392310586353, "loss": 4.2602, "step": 15400 },
    { "epoch": 0.17, "grad_norm": 5.798460483551025, "learning_rate": 0.00019206782104887223, "loss": 4.3096, "step": 15450 },
    { "epoch": 0.17, "grad_norm": 5.016506671905518, "learning_rate": 0.00019199610153989712, "loss": 4.2073, "step": 15500 },
    { "epoch": 0.17, "grad_norm": 9.708767890930176, "learning_rate": 0.0001919240727728054, "loss": 4.2099, "step": 15550 },
    { "epoch": 0.17, "grad_norm": 4.904361248016357, "learning_rate": 0.00019185173498973204, "loss": 4.2461, "step": 15600 },
    { "epoch": 0.17, "grad_norm": 5.290199279785156, "learning_rate": 0.00019177908843385103, "loss": 4.115, "step": 15650 },
    { "epoch": 0.17, "grad_norm": 6.290179252624512, "learning_rate": 0.00019170613334937406, "loss": 4.3295, "step": 15700 },
    { "epoch": 0.17, "grad_norm": 5.071104526519775, "learning_rate": 0.00019163286998155027, "loss": 4.1532, "step": 15750 },
    { "epoch": 0.18, "grad_norm": 4.5464067459106445, "learning_rate": 0.00019155929857666494, "loss": 4.0761, "step": 15800 },
    { "epoch": 0.18, "grad_norm": 4.664229393005371, "learning_rate": 0.0001914854193820389, "loss": 4.1371, "step": 15850 },
    { "epoch": 0.18, "grad_norm": 7.168484210968018, "learning_rate": 0.0001914112326460277, "loss": 4.178, "step": 15900 },
    { "epoch": 0.18, "grad_norm": 6.570041179656982, "learning_rate": 0.0001913367386180207, "loss": 4.1536, "step": 15950 },
    { "epoch": 0.18, "grad_norm": 5.298222064971924, "learning_rate": 0.00019126193754844036, "loss": 4.2089, "step": 16000 },
    { "epoch": 0.18, "grad_norm": 7.139255523681641, "learning_rate": 0.0001911868296887411, "loss": 4.1362, "step": 16050 },
    { "epoch": 0.18, "grad_norm": 5.763050556182861, "learning_rate": 0.00019111141529140887, "loss": 4.1106, "step": 16100 },
    { "epoch": 0.18, "grad_norm": 6.586143493652344, "learning_rate": 0.00019103569460995998, "loss": 3.9519, "step": 16150 },
    { "epoch": 0.18, "grad_norm": 5.827348232269287, "learning_rate": 0.00019095966789894038, "loss": 3.9598, "step": 16200 },
    { "epoch": 0.18, "grad_norm": 5.121611595153809, "learning_rate": 0.00019088333541392478, "loss": 4.1347, "step": 16250 },
    { "epoch": 0.18, "grad_norm": 5.110377788543701, "learning_rate": 0.00019080669741151581, "loss": 4.0088, "step": 16300 },
    { "epoch": 0.18, "grad_norm": 6.672893047332764, "learning_rate": 0.00019072975414934318, "loss": 4.0916, "step": 16350 },
    { "epoch": 0.18, "grad_norm": 5.667397499084473, "learning_rate": 0.00019065250588606262, "loss": 4.0695, "step": 16400 },
    { "epoch": 0.18, "grad_norm": 6.404243469238281, "learning_rate": 0.0001905749528813553, "loss": 3.9728, "step": 16450 },
    { "epoch": 0.18, "grad_norm": 6.912601470947266, "learning_rate": 0.00019049709539592686, "loss": 4.029, "step": 16500 },
    { "epoch": 0.18, "grad_norm": 5.015479564666748, "learning_rate": 0.00019041893369150636, "loss": 4.0268, "step": 16550 },
    { "epoch": 0.18, "grad_norm": 6.656422138214111, "learning_rate": 0.00019034046803084563, "loss": 4.0393, "step": 16600 },
    { "epoch": 0.18, "grad_norm": 4.685242176055908, "learning_rate": 0.00019026169867771825, "loss": 4.1104, "step": 16650 },
    { "epoch": 0.19, "grad_norm": 6.503780364990234, "learning_rate": 0.00019018262589691874, "loss": 4.0344, "step": 16700 },
    { "epoch": 0.19, "grad_norm": 4.73757266998291, "learning_rate": 0.00019010324995426156, "loss": 4.1114, "step": 16750 },
    { "epoch": 0.19, "grad_norm": 7.276214122772217, "learning_rate": 0.0001900235711165804, "loss": 3.8838, "step": 16800 },
    { "epoch": 0.19, "grad_norm": 6.2224273681640625, "learning_rate": 0.00018994358965172717, "loss": 3.9479, "step": 16850 },
    { "epoch": 0.19, "grad_norm": 6.4751996994018555, "learning_rate": 0.00018986330582857096, "loss": 4.0079, "step": 16900 },
    { "epoch": 0.19, "grad_norm": 4.874088764190674, "learning_rate": 0.00018978271991699743, "loss": 4.1664, "step": 16950 },
    { "epoch": 0.19, "grad_norm": 7.713326454162598, "learning_rate": 0.0001897018321879077, "loss": 3.9646, "step": 17000 },
    { "epoch": 0.19, "grad_norm": 5.753252029418945, "learning_rate": 0.00018962064291321747, "loss": 3.8574, "step": 17050 },
    { "epoch": 0.19, "grad_norm": 5.962434768676758, "learning_rate": 0.0001895391523658562, "loss": 3.9757, "step": 17100 },
    { "epoch": 0.19, "grad_norm": 5.875513553619385, "learning_rate": 0.00018945736081976607, "loss": 4.0424, "step": 17150 },
    { "epoch": 0.19, "grad_norm": 6.298293590545654, "learning_rate": 0.00018937526854990108, "loss": 3.958, "step": 17200 },
    { "epoch": 0.19, "grad_norm": 4.98872184753418, "learning_rate": 0.00018929287583222625, "loss": 3.9225, "step": 17250 },
    { "epoch": 0.19, "grad_norm": 6.467836380004883, "learning_rate": 0.00018921018294371645, "loss": 3.9369, "step": 17300 },
    { "epoch": 0.19, "grad_norm": 5.920988082885742, "learning_rate": 0.0001891271901623558, "loss": 3.975, "step": 17350 },
    { "epoch": 0.19, "grad_norm": 5.652931213378906, "learning_rate": 0.00018904389776713641, "loss": 3.9067, "step": 17400 },
    { "epoch": 0.19, "grad_norm": 5.372093200683594, "learning_rate": 0.00018896030603805767, "loss": 3.9267, "step": 17450 },
    { "epoch": 0.19, "grad_norm": 5.743618965148926, "learning_rate": 0.00018887641525612518, "loss": 3.8912, "step": 17500 },
    { "epoch": 0.19, "grad_norm": 8.207468032836914, "learning_rate": 0.00018879222570334985, "loss": 3.9101, "step": 17550 },
    { "epoch": 0.2, "grad_norm": 6.930370807647705, "learning_rate": 0.00018870773766274697, "loss": 3.8817, "step": 17600 },
    { "epoch": 0.2, "grad_norm": 6.367077350616455, "learning_rate": 0.00018862295141833523, "loss": 3.8931, "step": 17650 },
    { "epoch": 0.2, "grad_norm": 6.587210178375244, "learning_rate": 0.00018853786725513575, "loss": 3.9393, "step": 17700 },
    { "epoch": 0.2, "grad_norm": 5.502545356750488, "learning_rate": 0.0001884524854591712, "loss": 3.8489, "step": 17750 },
    { "epoch": 0.2, "grad_norm": 6.352043628692627, "learning_rate": 0.00018836680631746476, "loss": 3.8162, "step": 17800 },
    { "epoch": 0.2, "grad_norm": 5.686196804046631, "learning_rate": 0.00018828083011803917, "loss": 3.9476, "step": 17850 },
    { "epoch": 0.2, "grad_norm": 6.225170612335205, "learning_rate": 0.00018819455714991578, "loss": 3.9404, "step": 17900 },
    { "epoch": 0.2, "grad_norm": 7.1347150802612305, "learning_rate": 0.0001881079877031136, "loss": 3.9798, "step": 17950 },
    { "epoch": 0.2, "grad_norm": 5.343573093414307, "learning_rate": 0.0001880211220686482, "loss": 3.9038, "step": 18000 },
    { "epoch": 0.2, "grad_norm": 6.858921051025391, "learning_rate": 0.00018793396053853098, "loss": 3.8792, "step": 18050 },
    { "epoch": 0.2, "grad_norm": 6.721033573150635, "learning_rate": 0.0001878482554434291, "loss": 3.8421, "step": 18100 },
    { "epoch": 0.2, "grad_norm": 6.173632621765137, "learning_rate": 0.00018776050890530516, "loss": 4.0233, "step": 18150 },
    { "epoch": 0.2, "grad_norm": 5.996013164520264, "learning_rate": 0.00018767246734761796, "loss": 3.8057, "step": 18200 },
    { "epoch": 0.2, "grad_norm": 5.707641124725342, "learning_rate": 0.00018758413106633186, "loss": 3.8299, "step": 18250 },
    { "epoch": 0.2, "grad_norm": 7.221241474151611, "learning_rate": 0.00018749550035840193, "loss": 3.8828, "step": 18300 },
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.554357528686523, | |
| "learning_rate": 0.00018740657552177305, | |
| "loss": 3.8553, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.664674282073975, | |
| "learning_rate": 0.00018731735685537885, | |
| "loss": 3.8838, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.485450267791748, | |
| "learning_rate": 0.00018722784465914071, | |
| "loss": 3.8165, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.825826644897461, | |
| "learning_rate": 0.00018713803923396668, | |
| "loss": 3.7588, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.392491817474365, | |
| "learning_rate": 0.0001870479408817507, | |
| "loss": 3.8001, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.493740081787109, | |
| "learning_rate": 0.00018695754990537123, | |
| "loss": 3.9735, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.905117511749268, | |
| "learning_rate": 0.00018686686660869062, | |
| "loss": 3.7334, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.598316192626953, | |
| "learning_rate": 0.0001867758912965537, | |
| "loss": 3.8269, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.979629039764404, | |
| "learning_rate": 0.00018668462427478714, | |
| "loss": 3.8713, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.480854511260986, | |
| "learning_rate": 0.00018659306585019813, | |
| "loss": 3.7792, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.820549488067627, | |
| "learning_rate": 0.00018650121633057346, | |
| "loss": 3.6656, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.579679012298584, | |
| "learning_rate": 0.0001864090760246785, | |
| "loss": 3.9109, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.669819355010986, | |
| "learning_rate": 0.00018631664524225615, | |
| "loss": 3.7815, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.644351005554199, | |
| "learning_rate": 0.0001862239242940257, | |
| "loss": 3.7529, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.022332191467285, | |
| "learning_rate": 0.00018613091349168205, | |
| "loss": 3.7001, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.376641273498535, | |
| "learning_rate": 0.00018603761314789425, | |
| "loss": 3.6871, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.298123359680176, | |
| "learning_rate": 0.00018594402357630495, | |
| "loss": 3.8095, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.590997695922852, | |
| "learning_rate": 0.00018585014509152882, | |
| "loss": 3.8069, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.710943222045898, | |
| "learning_rate": 0.00018575597800915198, | |
| "loss": 3.8547, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.5094499588012695, | |
| "learning_rate": 0.0001856615226457305, | |
| "loss": 3.7314, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.584799766540527, | |
| "learning_rate": 0.0001855667793187898, | |
| "loss": 3.7514, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.0391154289245605, | |
| "learning_rate": 0.00018547174834682308, | |
| "loss": 3.6231, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.92927885055542, | |
| "learning_rate": 0.00018537643004929067, | |
| "loss": 3.7008, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.359600067138672, | |
| "learning_rate": 0.00018528082474661867, | |
| "loss": 3.798, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.198579788208008, | |
| "learning_rate": 0.0001851849327601981, | |
| "loss": 3.7187, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.796758651733398, | |
| "learning_rate": 0.00018508875441238364, | |
| "loss": 3.7086, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.889728546142578, | |
| "learning_rate": 0.00018499229002649258, | |
| "loss": 3.7387, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.382203102111816, | |
| "learning_rate": 0.0001848955399268039, | |
| "loss": 3.5992, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.061376571655273, | |
| "learning_rate": 0.00018479850443855686, | |
| "loss": 3.6865, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.2180681228637695, | |
| "learning_rate": 0.0001847011838879503, | |
| "loss": 3.7467, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.063679218292236, | |
| "learning_rate": 0.0001846035786021412, | |
| "loss": 3.6894, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.036098480224609, | |
| "learning_rate": 0.00018450568890924373, | |
| "loss": 3.6412, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.86781644821167, | |
| "learning_rate": 0.00018440751513832822, | |
| "loss": 3.637, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.41668176651001, | |
| "learning_rate": 0.00018430905761941983, | |
| "loss": 3.6814, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.117024898529053, | |
| "learning_rate": 0.00018421031668349773, | |
| "loss": 3.6257, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.368699073791504, | |
| "learning_rate": 0.00018411129266249373, | |
| "loss": 3.7111, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.378394603729248, | |
| "learning_rate": 0.0001840119858892913, | |
| "loss": 3.7197, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.029990196228027, | |
| "learning_rate": 0.0001839123966977245, | |
| "loss": 3.7267, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 9.922813415527344, | |
| "learning_rate": 0.00018381252542257662, | |
| "loss": 3.7203, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.9374518394470215, | |
| "learning_rate": 0.00018371237239957932, | |
| "loss": 3.6876, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.682550430297852, | |
| "learning_rate": 0.00018361193796541142, | |
| "loss": 3.6862, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.477772235870361, | |
| "learning_rate": 0.00018351122245769771, | |
| "loss": 3.5982, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.745680332183838, | |
| "learning_rate": 0.00018341224888886997, | |
| "loss": 3.6978, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.691402912139893, | |
| "learning_rate": 0.0001833109778552932, | |
| "loss": 3.6693, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.229629993438721, | |
| "learning_rate": 0.00018320942675989125, | |
| "loss": 3.6327, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.655289649963379, | |
| "learning_rate": 0.0001831075959440427, | |
| "loss": 3.6032, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.4868927001953125, | |
| "learning_rate": 0.00018300548575006658, | |
| "loss": 3.7059, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.387706756591797, | |
| "learning_rate": 0.00018290309652122083, | |
| "loss": 3.6838, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.884798049926758, | |
| "learning_rate": 0.00018280042860170168, | |
| "loss": 3.665, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.185595512390137, | |
| "learning_rate": 0.00018269748233664204, | |
| "loss": 3.6057, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.449123382568359, | |
| "learning_rate": 0.0001825942580721106, | |
| "loss": 3.6262, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.469310283660889, | |
| "learning_rate": 0.00018249075615511053, | |
| "loss": 3.522, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.678877353668213, | |
| "learning_rate": 0.0001823869769335784, | |
| "loss": 3.6757, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.033955097198486, | |
| "learning_rate": 0.000182282920756383, | |
| "loss": 3.7316, | |
| "step": 21000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 90183, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "total_flos": 7551401066496000.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
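
The record above is the tail of a Hugging Face `Trainer` checkpoint state, conventionally saved as `trainer_state.json` inside a checkpoint directory. Training has reached `global_step` 21,000 of `max_steps` 90,183; since the run is scheduled for a single epoch, the fractional epoch is simply `21000 / 90183 ≈ 0.23286`, which matches the recorded `epoch` of 0.2328598516350088. Within this span the training loss drifts from roughly 3.96 at step 16,200 to roughly 3.73 at step 21,000, while the learning rate, already past its warmup peak, decays gently from about 1.910e-4 to 1.823e-4. Below is a minimal sketch of how one might load and visualize such a log; the file path `trainer_state.json` and the use of matplotlib are assumptions for illustration, not something the state itself records.

```python
# Minimal sketch: plot training loss and learning rate from a Trainer
# state file. Assumes the JSON above is saved as "trainer_state.json"
# (hypothetical path) and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each training entry in log_history carries step, loss, learning_rate,
# and grad_norm, logged every `logging_steps` (= 50 here) optimizer steps.
# The `"loss" in e` guard skips any eval-only records (which would carry
# eval_loss instead), should the Trainer have interleaved them.
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.suptitle(f"Progress at step {state['global_step']} / {state['max_steps']}")
plt.show()
```

Filtering on the presence of the `loss` key (rather than indexing blindly) keeps the sketch robust to mixed log histories; everything else reads fields exactly as they appear in the state above.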