behemoth-1.2-distill / trainer_state.json
reissbaker's picture
Upload 6 files
9ddbf48 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 0,
"global_step": 452,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004424778761061947,
"grad_norm": 0.057373046875,
"learning_rate": 0.00039911504424778763,
"loss": 1.3739,
"step": 1
},
{
"epoch": 0.008849557522123894,
"grad_norm": 0.1201171875,
"learning_rate": 0.00039823008849557525,
"loss": 1.4091,
"step": 2
},
{
"epoch": 0.01327433628318584,
"grad_norm": 0.0751953125,
"learning_rate": 0.00039734513274336286,
"loss": 1.2628,
"step": 3
},
{
"epoch": 0.017699115044247787,
"grad_norm": 0.064453125,
"learning_rate": 0.0003964601769911505,
"loss": 1.1101,
"step": 4
},
{
"epoch": 0.022123893805309734,
"grad_norm": 0.06396484375,
"learning_rate": 0.0003955752212389381,
"loss": 1.344,
"step": 5
},
{
"epoch": 0.02654867256637168,
"grad_norm": 0.0634765625,
"learning_rate": 0.00039469026548672565,
"loss": 1.1884,
"step": 6
},
{
"epoch": 0.030973451327433628,
"grad_norm": 0.0625,
"learning_rate": 0.0003938053097345133,
"loss": 1.1329,
"step": 7
},
{
"epoch": 0.035398230088495575,
"grad_norm": 0.052490234375,
"learning_rate": 0.0003929203539823009,
"loss": 1.138,
"step": 8
},
{
"epoch": 0.03982300884955752,
"grad_norm": 0.0625,
"learning_rate": 0.00039203539823008855,
"loss": 1.0113,
"step": 9
},
{
"epoch": 0.04424778761061947,
"grad_norm": 0.041748046875,
"learning_rate": 0.0003911504424778761,
"loss": 1.087,
"step": 10
},
{
"epoch": 0.048672566371681415,
"grad_norm": 0.046630859375,
"learning_rate": 0.0003902654867256637,
"loss": 1.1459,
"step": 11
},
{
"epoch": 0.05309734513274336,
"grad_norm": 0.03662109375,
"learning_rate": 0.00038938053097345134,
"loss": 1.1421,
"step": 12
},
{
"epoch": 0.05752212389380531,
"grad_norm": 0.035888671875,
"learning_rate": 0.00038849557522123895,
"loss": 1.175,
"step": 13
},
{
"epoch": 0.061946902654867256,
"grad_norm": 0.037109375,
"learning_rate": 0.00038761061946902657,
"loss": 1.2099,
"step": 14
},
{
"epoch": 0.06637168141592921,
"grad_norm": 0.038818359375,
"learning_rate": 0.0003867256637168142,
"loss": 1.1295,
"step": 15
},
{
"epoch": 0.07079646017699115,
"grad_norm": 0.0419921875,
"learning_rate": 0.00038584070796460174,
"loss": 1.0737,
"step": 16
},
{
"epoch": 0.0752212389380531,
"grad_norm": 0.037109375,
"learning_rate": 0.0003849557522123894,
"loss": 1.1563,
"step": 17
},
{
"epoch": 0.07964601769911504,
"grad_norm": 0.039306640625,
"learning_rate": 0.000384070796460177,
"loss": 1.1061,
"step": 18
},
{
"epoch": 0.084070796460177,
"grad_norm": 0.050048828125,
"learning_rate": 0.00038318584070796464,
"loss": 1.1052,
"step": 19
},
{
"epoch": 0.08849557522123894,
"grad_norm": 0.036865234375,
"learning_rate": 0.00038230088495575226,
"loss": 1.0009,
"step": 20
},
{
"epoch": 0.09292035398230089,
"grad_norm": 0.041015625,
"learning_rate": 0.0003814159292035398,
"loss": 0.9805,
"step": 21
},
{
"epoch": 0.09734513274336283,
"grad_norm": 0.03173828125,
"learning_rate": 0.0003805309734513275,
"loss": 1.1098,
"step": 22
},
{
"epoch": 0.10176991150442478,
"grad_norm": 0.0322265625,
"learning_rate": 0.00037964601769911505,
"loss": 1.0691,
"step": 23
},
{
"epoch": 0.10619469026548672,
"grad_norm": 0.05029296875,
"learning_rate": 0.00037876106194690266,
"loss": 1.2944,
"step": 24
},
{
"epoch": 0.11061946902654868,
"grad_norm": 0.0419921875,
"learning_rate": 0.0003778761061946903,
"loss": 1.0819,
"step": 25
},
{
"epoch": 0.11504424778761062,
"grad_norm": 0.0341796875,
"learning_rate": 0.0003769911504424779,
"loss": 1.215,
"step": 26
},
{
"epoch": 0.11946902654867257,
"grad_norm": 0.041015625,
"learning_rate": 0.0003761061946902655,
"loss": 1.0624,
"step": 27
},
{
"epoch": 0.12389380530973451,
"grad_norm": 0.03271484375,
"learning_rate": 0.0003752212389380531,
"loss": 1.0258,
"step": 28
},
{
"epoch": 0.12831858407079647,
"grad_norm": 0.038330078125,
"learning_rate": 0.00037433628318584073,
"loss": 1.0544,
"step": 29
},
{
"epoch": 0.13274336283185842,
"grad_norm": 0.035400390625,
"learning_rate": 0.00037345132743362835,
"loss": 1.0203,
"step": 30
},
{
"epoch": 0.13716814159292035,
"grad_norm": 0.05810546875,
"learning_rate": 0.0003725663716814159,
"loss": 1.1584,
"step": 31
},
{
"epoch": 0.1415929203539823,
"grad_norm": 0.0341796875,
"learning_rate": 0.0003716814159292036,
"loss": 0.9215,
"step": 32
},
{
"epoch": 0.14601769911504425,
"grad_norm": 0.03857421875,
"learning_rate": 0.0003707964601769912,
"loss": 1.1255,
"step": 33
},
{
"epoch": 0.1504424778761062,
"grad_norm": 0.053466796875,
"learning_rate": 0.00036991150442477875,
"loss": 1.3504,
"step": 34
},
{
"epoch": 0.15486725663716813,
"grad_norm": 0.0419921875,
"learning_rate": 0.0003690265486725664,
"loss": 1.0819,
"step": 35
},
{
"epoch": 0.1592920353982301,
"grad_norm": 0.041259765625,
"learning_rate": 0.000368141592920354,
"loss": 1.2328,
"step": 36
},
{
"epoch": 0.16371681415929204,
"grad_norm": 0.04345703125,
"learning_rate": 0.00036725663716814165,
"loss": 1.1783,
"step": 37
},
{
"epoch": 0.168141592920354,
"grad_norm": 0.044189453125,
"learning_rate": 0.0003663716814159292,
"loss": 1.105,
"step": 38
},
{
"epoch": 0.17256637168141592,
"grad_norm": 0.05224609375,
"learning_rate": 0.0003654867256637168,
"loss": 1.1757,
"step": 39
},
{
"epoch": 0.17699115044247787,
"grad_norm": 0.042236328125,
"learning_rate": 0.00036460176991150444,
"loss": 1.1601,
"step": 40
},
{
"epoch": 0.18141592920353983,
"grad_norm": 0.04296875,
"learning_rate": 0.00036371681415929205,
"loss": 0.9869,
"step": 41
},
{
"epoch": 0.18584070796460178,
"grad_norm": 0.05419921875,
"learning_rate": 0.00036283185840707967,
"loss": 1.0769,
"step": 42
},
{
"epoch": 0.1902654867256637,
"grad_norm": 0.0361328125,
"learning_rate": 0.0003619469026548673,
"loss": 1.015,
"step": 43
},
{
"epoch": 0.19469026548672566,
"grad_norm": 0.03564453125,
"learning_rate": 0.00036106194690265484,
"loss": 0.9435,
"step": 44
},
{
"epoch": 0.19911504424778761,
"grad_norm": 0.058837890625,
"learning_rate": 0.0003601769911504425,
"loss": 1.1832,
"step": 45
},
{
"epoch": 0.20353982300884957,
"grad_norm": 0.052490234375,
"learning_rate": 0.00035929203539823007,
"loss": 1.1826,
"step": 46
},
{
"epoch": 0.2079646017699115,
"grad_norm": 0.0625,
"learning_rate": 0.00035840707964601774,
"loss": 1.02,
"step": 47
},
{
"epoch": 0.21238938053097345,
"grad_norm": 0.047607421875,
"learning_rate": 0.0003575221238938053,
"loss": 1.0803,
"step": 48
},
{
"epoch": 0.2168141592920354,
"grad_norm": 0.041015625,
"learning_rate": 0.0003566371681415929,
"loss": 1.021,
"step": 49
},
{
"epoch": 0.22123893805309736,
"grad_norm": 0.041015625,
"learning_rate": 0.0003557522123893806,
"loss": 1.0058,
"step": 50
},
{
"epoch": 0.22566371681415928,
"grad_norm": 0.040771484375,
"learning_rate": 0.00035486725663716814,
"loss": 1.0489,
"step": 51
},
{
"epoch": 0.23008849557522124,
"grad_norm": 0.040771484375,
"learning_rate": 0.0003539823008849558,
"loss": 0.986,
"step": 52
},
{
"epoch": 0.2345132743362832,
"grad_norm": 0.039794921875,
"learning_rate": 0.00035309734513274337,
"loss": 1.0928,
"step": 53
},
{
"epoch": 0.23893805309734514,
"grad_norm": 0.0419921875,
"learning_rate": 0.000352212389380531,
"loss": 1.0037,
"step": 54
},
{
"epoch": 0.24336283185840707,
"grad_norm": 0.035888671875,
"learning_rate": 0.0003513274336283186,
"loss": 1.0165,
"step": 55
},
{
"epoch": 0.24778761061946902,
"grad_norm": 0.046630859375,
"learning_rate": 0.0003504424778761062,
"loss": 0.9856,
"step": 56
},
{
"epoch": 0.252212389380531,
"grad_norm": 0.0390625,
"learning_rate": 0.00034955752212389383,
"loss": 1.0988,
"step": 57
},
{
"epoch": 0.25663716814159293,
"grad_norm": 0.035400390625,
"learning_rate": 0.00034867256637168145,
"loss": 0.9983,
"step": 58
},
{
"epoch": 0.2610619469026549,
"grad_norm": 0.0390625,
"learning_rate": 0.000347787610619469,
"loss": 1.0727,
"step": 59
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.0380859375,
"learning_rate": 0.0003469026548672567,
"loss": 0.9617,
"step": 60
},
{
"epoch": 0.26991150442477874,
"grad_norm": 0.04638671875,
"learning_rate": 0.00034601769911504423,
"loss": 1.1435,
"step": 61
},
{
"epoch": 0.2743362831858407,
"grad_norm": 0.0419921875,
"learning_rate": 0.0003451327433628319,
"loss": 1.0895,
"step": 62
},
{
"epoch": 0.27876106194690264,
"grad_norm": 0.038330078125,
"learning_rate": 0.00034424778761061946,
"loss": 1.0823,
"step": 63
},
{
"epoch": 0.2831858407079646,
"grad_norm": 0.042236328125,
"learning_rate": 0.0003433628318584071,
"loss": 1.1119,
"step": 64
},
{
"epoch": 0.28761061946902655,
"grad_norm": 0.0576171875,
"learning_rate": 0.00034247787610619475,
"loss": 1.2428,
"step": 65
},
{
"epoch": 0.2920353982300885,
"grad_norm": 0.04541015625,
"learning_rate": 0.0003415929203539823,
"loss": 0.9943,
"step": 66
},
{
"epoch": 0.29646017699115046,
"grad_norm": 0.0439453125,
"learning_rate": 0.0003407079646017699,
"loss": 1.3215,
"step": 67
},
{
"epoch": 0.3008849557522124,
"grad_norm": 0.03515625,
"learning_rate": 0.00033982300884955754,
"loss": 0.9997,
"step": 68
},
{
"epoch": 0.3053097345132743,
"grad_norm": 0.039794921875,
"learning_rate": 0.00033893805309734515,
"loss": 0.9796,
"step": 69
},
{
"epoch": 0.30973451327433627,
"grad_norm": 0.044677734375,
"learning_rate": 0.00033805309734513277,
"loss": 1.1079,
"step": 70
},
{
"epoch": 0.3141592920353982,
"grad_norm": 0.041259765625,
"learning_rate": 0.0003371681415929204,
"loss": 1.0242,
"step": 71
},
{
"epoch": 0.3185840707964602,
"grad_norm": 0.04638671875,
"learning_rate": 0.000336283185840708,
"loss": 1.0227,
"step": 72
},
{
"epoch": 0.3230088495575221,
"grad_norm": 0.042236328125,
"learning_rate": 0.0003353982300884956,
"loss": 0.9375,
"step": 73
},
{
"epoch": 0.3274336283185841,
"grad_norm": 0.03759765625,
"learning_rate": 0.00033451327433628317,
"loss": 1.0104,
"step": 74
},
{
"epoch": 0.33185840707964603,
"grad_norm": 0.041748046875,
"learning_rate": 0.00033362831858407084,
"loss": 1.1685,
"step": 75
},
{
"epoch": 0.336283185840708,
"grad_norm": 0.051513671875,
"learning_rate": 0.0003327433628318584,
"loss": 1.2954,
"step": 76
},
{
"epoch": 0.3407079646017699,
"grad_norm": 0.0517578125,
"learning_rate": 0.000331858407079646,
"loss": 0.9816,
"step": 77
},
{
"epoch": 0.34513274336283184,
"grad_norm": 0.04248046875,
"learning_rate": 0.00033097345132743363,
"loss": 1.0791,
"step": 78
},
{
"epoch": 0.3495575221238938,
"grad_norm": 0.043701171875,
"learning_rate": 0.00033008849557522124,
"loss": 1.0989,
"step": 79
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.05029296875,
"learning_rate": 0.00032920353982300886,
"loss": 1.1164,
"step": 80
},
{
"epoch": 0.3584070796460177,
"grad_norm": 0.0400390625,
"learning_rate": 0.00032831858407079647,
"loss": 1.2053,
"step": 81
},
{
"epoch": 0.36283185840707965,
"grad_norm": 0.041748046875,
"learning_rate": 0.0003274336283185841,
"loss": 1.0322,
"step": 82
},
{
"epoch": 0.3672566371681416,
"grad_norm": 0.064453125,
"learning_rate": 0.0003265486725663717,
"loss": 0.9184,
"step": 83
},
{
"epoch": 0.37168141592920356,
"grad_norm": 0.037353515625,
"learning_rate": 0.0003256637168141593,
"loss": 1.0874,
"step": 84
},
{
"epoch": 0.37610619469026546,
"grad_norm": 0.04638671875,
"learning_rate": 0.00032477876106194693,
"loss": 1.0051,
"step": 85
},
{
"epoch": 0.3805309734513274,
"grad_norm": 0.052001953125,
"learning_rate": 0.00032389380530973454,
"loss": 1.1232,
"step": 86
},
{
"epoch": 0.38495575221238937,
"grad_norm": 0.036865234375,
"learning_rate": 0.0003230088495575221,
"loss": 0.9745,
"step": 87
},
{
"epoch": 0.3893805309734513,
"grad_norm": 0.037353515625,
"learning_rate": 0.0003221238938053098,
"loss": 0.9092,
"step": 88
},
{
"epoch": 0.3938053097345133,
"grad_norm": 0.04931640625,
"learning_rate": 0.00032123893805309733,
"loss": 1.0712,
"step": 89
},
{
"epoch": 0.39823008849557523,
"grad_norm": 0.043701171875,
"learning_rate": 0.000320353982300885,
"loss": 1.0908,
"step": 90
},
{
"epoch": 0.4026548672566372,
"grad_norm": 0.04150390625,
"learning_rate": 0.00031946902654867256,
"loss": 1.0897,
"step": 91
},
{
"epoch": 0.40707964601769914,
"grad_norm": 0.03857421875,
"learning_rate": 0.0003185840707964602,
"loss": 0.8939,
"step": 92
},
{
"epoch": 0.41150442477876104,
"grad_norm": 0.044677734375,
"learning_rate": 0.0003176991150442478,
"loss": 1.0992,
"step": 93
},
{
"epoch": 0.415929203539823,
"grad_norm": 0.038818359375,
"learning_rate": 0.0003168141592920354,
"loss": 0.937,
"step": 94
},
{
"epoch": 0.42035398230088494,
"grad_norm": 0.0634765625,
"learning_rate": 0.000315929203539823,
"loss": 1.1744,
"step": 95
},
{
"epoch": 0.4247787610619469,
"grad_norm": 0.042236328125,
"learning_rate": 0.00031504424778761064,
"loss": 1.0227,
"step": 96
},
{
"epoch": 0.42920353982300885,
"grad_norm": 0.041259765625,
"learning_rate": 0.00031415929203539825,
"loss": 1.112,
"step": 97
},
{
"epoch": 0.4336283185840708,
"grad_norm": 0.047119140625,
"learning_rate": 0.00031327433628318586,
"loss": 0.9122,
"step": 98
},
{
"epoch": 0.43805309734513276,
"grad_norm": 0.04931640625,
"learning_rate": 0.0003123893805309735,
"loss": 1.0073,
"step": 99
},
{
"epoch": 0.4424778761061947,
"grad_norm": 0.040283203125,
"learning_rate": 0.0003115044247787611,
"loss": 1.0326,
"step": 100
},
{
"epoch": 0.4469026548672566,
"grad_norm": 0.046142578125,
"learning_rate": 0.0003106194690265487,
"loss": 1.0014,
"step": 101
},
{
"epoch": 0.45132743362831856,
"grad_norm": 0.041015625,
"learning_rate": 0.00030973451327433627,
"loss": 1.1081,
"step": 102
},
{
"epoch": 0.4557522123893805,
"grad_norm": 0.041015625,
"learning_rate": 0.00030884955752212394,
"loss": 1.1268,
"step": 103
},
{
"epoch": 0.46017699115044247,
"grad_norm": 0.05078125,
"learning_rate": 0.0003079646017699115,
"loss": 1.0382,
"step": 104
},
{
"epoch": 0.4646017699115044,
"grad_norm": 0.0576171875,
"learning_rate": 0.00030707964601769917,
"loss": 0.9887,
"step": 105
},
{
"epoch": 0.4690265486725664,
"grad_norm": 0.0390625,
"learning_rate": 0.0003061946902654867,
"loss": 1.0143,
"step": 106
},
{
"epoch": 0.47345132743362833,
"grad_norm": 0.06982421875,
"learning_rate": 0.00030530973451327434,
"loss": 1.0332,
"step": 107
},
{
"epoch": 0.4778761061946903,
"grad_norm": 0.044921875,
"learning_rate": 0.00030442477876106196,
"loss": 0.9422,
"step": 108
},
{
"epoch": 0.4823008849557522,
"grad_norm": 0.06298828125,
"learning_rate": 0.00030353982300884957,
"loss": 1.0376,
"step": 109
},
{
"epoch": 0.48672566371681414,
"grad_norm": 0.04833984375,
"learning_rate": 0.0003026548672566372,
"loss": 1.1175,
"step": 110
},
{
"epoch": 0.4911504424778761,
"grad_norm": 0.044189453125,
"learning_rate": 0.0003017699115044248,
"loss": 0.9571,
"step": 111
},
{
"epoch": 0.49557522123893805,
"grad_norm": 0.0478515625,
"learning_rate": 0.00030088495575221236,
"loss": 1.0857,
"step": 112
},
{
"epoch": 0.5,
"grad_norm": 0.059814453125,
"learning_rate": 0.00030000000000000003,
"loss": 0.9346,
"step": 113
},
{
"epoch": 0.504424778761062,
"grad_norm": 0.054443359375,
"learning_rate": 0.00029911504424778764,
"loss": 1.0317,
"step": 114
},
{
"epoch": 0.5088495575221239,
"grad_norm": 0.0625,
"learning_rate": 0.00029823008849557526,
"loss": 1.0535,
"step": 115
},
{
"epoch": 0.5132743362831859,
"grad_norm": 0.04150390625,
"learning_rate": 0.00029734513274336287,
"loss": 1.0437,
"step": 116
},
{
"epoch": 0.5176991150442478,
"grad_norm": 0.046142578125,
"learning_rate": 0.00029646017699115043,
"loss": 1.0253,
"step": 117
},
{
"epoch": 0.5221238938053098,
"grad_norm": 0.07421875,
"learning_rate": 0.0002955752212389381,
"loss": 1.022,
"step": 118
},
{
"epoch": 0.5265486725663717,
"grad_norm": 0.058837890625,
"learning_rate": 0.00029469026548672566,
"loss": 1.2344,
"step": 119
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.0576171875,
"learning_rate": 0.0002938053097345133,
"loss": 0.9828,
"step": 120
},
{
"epoch": 0.5353982300884956,
"grad_norm": 0.05078125,
"learning_rate": 0.0002929203539823009,
"loss": 0.9207,
"step": 121
},
{
"epoch": 0.5398230088495575,
"grad_norm": 0.050537109375,
"learning_rate": 0.0002920353982300885,
"loss": 0.9794,
"step": 122
},
{
"epoch": 0.5442477876106194,
"grad_norm": 0.05908203125,
"learning_rate": 0.0002911504424778761,
"loss": 1.0962,
"step": 123
},
{
"epoch": 0.5486725663716814,
"grad_norm": 0.041748046875,
"learning_rate": 0.00029026548672566373,
"loss": 1.1614,
"step": 124
},
{
"epoch": 0.5530973451327433,
"grad_norm": 0.038330078125,
"learning_rate": 0.00028938053097345135,
"loss": 0.9082,
"step": 125
},
{
"epoch": 0.5575221238938053,
"grad_norm": 0.037353515625,
"learning_rate": 0.00028849557522123896,
"loss": 0.9406,
"step": 126
},
{
"epoch": 0.5619469026548672,
"grad_norm": 0.039306640625,
"learning_rate": 0.0002876106194690265,
"loss": 1.1105,
"step": 127
},
{
"epoch": 0.5663716814159292,
"grad_norm": 0.051025390625,
"learning_rate": 0.0002867256637168142,
"loss": 0.9679,
"step": 128
},
{
"epoch": 0.5707964601769911,
"grad_norm": 0.037109375,
"learning_rate": 0.00028584070796460175,
"loss": 0.9529,
"step": 129
},
{
"epoch": 0.5752212389380531,
"grad_norm": 0.056396484375,
"learning_rate": 0.00028495575221238937,
"loss": 1.0341,
"step": 130
},
{
"epoch": 0.5796460176991151,
"grad_norm": 0.039306640625,
"learning_rate": 0.00028407079646017704,
"loss": 0.9493,
"step": 131
},
{
"epoch": 0.584070796460177,
"grad_norm": 0.06591796875,
"learning_rate": 0.0002831858407079646,
"loss": 1.262,
"step": 132
},
{
"epoch": 0.588495575221239,
"grad_norm": 0.038330078125,
"learning_rate": 0.00028230088495575226,
"loss": 0.9412,
"step": 133
},
{
"epoch": 0.5929203539823009,
"grad_norm": 0.046875,
"learning_rate": 0.0002814159292035398,
"loss": 1.0563,
"step": 134
},
{
"epoch": 0.5973451327433629,
"grad_norm": 0.05712890625,
"learning_rate": 0.00028053097345132744,
"loss": 1.0201,
"step": 135
},
{
"epoch": 0.6017699115044248,
"grad_norm": 0.04052734375,
"learning_rate": 0.00027964601769911505,
"loss": 1.0401,
"step": 136
},
{
"epoch": 0.6061946902654868,
"grad_norm": 0.05078125,
"learning_rate": 0.00027876106194690267,
"loss": 1.0241,
"step": 137
},
{
"epoch": 0.6106194690265486,
"grad_norm": 0.05810546875,
"learning_rate": 0.0002778761061946903,
"loss": 1.1263,
"step": 138
},
{
"epoch": 0.6150442477876106,
"grad_norm": 0.048095703125,
"learning_rate": 0.0002769911504424779,
"loss": 1.0869,
"step": 139
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.447265625,
"learning_rate": 0.0002761061946902655,
"loss": 0.9944,
"step": 140
},
{
"epoch": 0.6238938053097345,
"grad_norm": 0.038818359375,
"learning_rate": 0.0002752212389380531,
"loss": 0.9675,
"step": 141
},
{
"epoch": 0.6283185840707964,
"grad_norm": 0.068359375,
"learning_rate": 0.0002743362831858407,
"loss": 1.0227,
"step": 142
},
{
"epoch": 0.6327433628318584,
"grad_norm": 0.072265625,
"learning_rate": 0.00027345132743362836,
"loss": 1.0381,
"step": 143
},
{
"epoch": 0.6371681415929203,
"grad_norm": 0.055908203125,
"learning_rate": 0.0002725663716814159,
"loss": 0.9385,
"step": 144
},
{
"epoch": 0.6415929203539823,
"grad_norm": 0.04248046875,
"learning_rate": 0.00027168141592920353,
"loss": 1.001,
"step": 145
},
{
"epoch": 0.6460176991150443,
"grad_norm": 0.06005859375,
"learning_rate": 0.0002707964601769912,
"loss": 1.04,
"step": 146
},
{
"epoch": 0.6504424778761062,
"grad_norm": 0.049072265625,
"learning_rate": 0.00026991150442477876,
"loss": 0.9735,
"step": 147
},
{
"epoch": 0.6548672566371682,
"grad_norm": 0.045654296875,
"learning_rate": 0.00026902654867256643,
"loss": 1.0873,
"step": 148
},
{
"epoch": 0.6592920353982301,
"grad_norm": 0.04638671875,
"learning_rate": 0.000268141592920354,
"loss": 1.1032,
"step": 149
},
{
"epoch": 0.6637168141592921,
"grad_norm": 0.051513671875,
"learning_rate": 0.0002672566371681416,
"loss": 1.0414,
"step": 150
},
{
"epoch": 0.668141592920354,
"grad_norm": 0.0419921875,
"learning_rate": 0.0002663716814159292,
"loss": 0.892,
"step": 151
},
{
"epoch": 0.672566371681416,
"grad_norm": 0.040771484375,
"learning_rate": 0.00026548672566371683,
"loss": 0.9048,
"step": 152
},
{
"epoch": 0.6769911504424779,
"grad_norm": 0.06494140625,
"learning_rate": 0.00026460176991150445,
"loss": 1.0745,
"step": 153
},
{
"epoch": 0.6814159292035398,
"grad_norm": 0.059814453125,
"learning_rate": 0.00026371681415929206,
"loss": 1.2796,
"step": 154
},
{
"epoch": 0.6858407079646017,
"grad_norm": 0.050048828125,
"learning_rate": 0.0002628318584070796,
"loss": 0.9484,
"step": 155
},
{
"epoch": 0.6902654867256637,
"grad_norm": 0.0458984375,
"learning_rate": 0.0002619469026548673,
"loss": 1.0571,
"step": 156
},
{
"epoch": 0.6946902654867256,
"grad_norm": 0.0439453125,
"learning_rate": 0.00026106194690265485,
"loss": 1.1435,
"step": 157
},
{
"epoch": 0.6991150442477876,
"grad_norm": 0.0458984375,
"learning_rate": 0.0002601769911504425,
"loss": 1.0,
"step": 158
},
{
"epoch": 0.7035398230088495,
"grad_norm": 0.039794921875,
"learning_rate": 0.0002592920353982301,
"loss": 1.0044,
"step": 159
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.049072265625,
"learning_rate": 0.0002584070796460177,
"loss": 1.001,
"step": 160
},
{
"epoch": 0.7123893805309734,
"grad_norm": 0.04541015625,
"learning_rate": 0.0002575221238938053,
"loss": 1.0643,
"step": 161
},
{
"epoch": 0.7168141592920354,
"grad_norm": 0.046630859375,
"learning_rate": 0.0002566371681415929,
"loss": 1.2461,
"step": 162
},
{
"epoch": 0.7212389380530974,
"grad_norm": 0.0458984375,
"learning_rate": 0.00025575221238938054,
"loss": 1.297,
"step": 163
},
{
"epoch": 0.7256637168141593,
"grad_norm": 0.349609375,
"learning_rate": 0.00025486725663716815,
"loss": 0.9718,
"step": 164
},
{
"epoch": 0.7300884955752213,
"grad_norm": 0.039794921875,
"learning_rate": 0.00025398230088495577,
"loss": 0.9553,
"step": 165
},
{
"epoch": 0.7345132743362832,
"grad_norm": 0.041748046875,
"learning_rate": 0.0002530973451327434,
"loss": 1.074,
"step": 166
},
{
"epoch": 0.7389380530973452,
"grad_norm": 0.0615234375,
"learning_rate": 0.000252212389380531,
"loss": 1.0015,
"step": 167
},
{
"epoch": 0.7433628318584071,
"grad_norm": 0.043212890625,
"learning_rate": 0.0002513274336283186,
"loss": 1.021,
"step": 168
},
{
"epoch": 0.7477876106194691,
"grad_norm": 0.0556640625,
"learning_rate": 0.0002504424778761062,
"loss": 1.063,
"step": 169
},
{
"epoch": 0.7522123893805309,
"grad_norm": 0.03759765625,
"learning_rate": 0.0002495575221238938,
"loss": 0.9415,
"step": 170
},
{
"epoch": 0.7566371681415929,
"grad_norm": 0.0673828125,
"learning_rate": 0.00024867256637168145,
"loss": 1.0556,
"step": 171
},
{
"epoch": 0.7610619469026548,
"grad_norm": 0.06298828125,
"learning_rate": 0.000247787610619469,
"loss": 1.1345,
"step": 172
},
{
"epoch": 0.7654867256637168,
"grad_norm": 0.044189453125,
"learning_rate": 0.00024690265486725663,
"loss": 0.9686,
"step": 173
},
{
"epoch": 0.7699115044247787,
"grad_norm": 0.18359375,
"learning_rate": 0.00024601769911504424,
"loss": 0.8729,
"step": 174
},
{
"epoch": 0.7743362831858407,
"grad_norm": 0.04736328125,
"learning_rate": 0.00024513274336283186,
"loss": 1.0424,
"step": 175
},
{
"epoch": 0.7787610619469026,
"grad_norm": 0.05322265625,
"learning_rate": 0.00024424778761061947,
"loss": 1.0317,
"step": 176
},
{
"epoch": 0.7831858407079646,
"grad_norm": 0.043212890625,
"learning_rate": 0.0002433628318584071,
"loss": 1.1979,
"step": 177
},
{
"epoch": 0.7876106194690266,
"grad_norm": 0.0615234375,
"learning_rate": 0.00024247787610619473,
"loss": 1.0134,
"step": 178
},
{
"epoch": 0.7920353982300885,
"grad_norm": 0.0615234375,
"learning_rate": 0.00024159292035398232,
"loss": 1.1044,
"step": 179
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.04443359375,
"learning_rate": 0.00024070796460176993,
"loss": 1.0293,
"step": 180
},
{
"epoch": 0.8008849557522124,
"grad_norm": 0.04248046875,
"learning_rate": 0.00023982300884955752,
"loss": 0.9629,
"step": 181
},
{
"epoch": 0.8053097345132744,
"grad_norm": 0.03857421875,
"learning_rate": 0.00023893805309734516,
"loss": 0.9511,
"step": 182
},
{
"epoch": 0.8097345132743363,
"grad_norm": 0.046142578125,
"learning_rate": 0.00023805309734513275,
"loss": 1.0096,
"step": 183
},
{
"epoch": 0.8141592920353983,
"grad_norm": 0.0498046875,
"learning_rate": 0.0002371681415929204,
"loss": 0.8986,
"step": 184
},
{
"epoch": 0.8185840707964602,
"grad_norm": 0.050048828125,
"learning_rate": 0.00023628318584070798,
"loss": 0.9618,
"step": 185
},
{
"epoch": 0.8230088495575221,
"grad_norm": 0.07177734375,
"learning_rate": 0.0002353982300884956,
"loss": 1.0183,
"step": 186
},
{
"epoch": 0.827433628318584,
"grad_norm": 0.06982421875,
"learning_rate": 0.00023451327433628318,
"loss": 0.9824,
"step": 187
},
{
"epoch": 0.831858407079646,
"grad_norm": 0.0439453125,
"learning_rate": 0.00023362831858407082,
"loss": 0.9304,
"step": 188
},
{
"epoch": 0.8362831858407079,
"grad_norm": 0.04736328125,
"learning_rate": 0.0002327433628318584,
"loss": 0.9942,
"step": 189
},
{
"epoch": 0.8407079646017699,
"grad_norm": 0.05029296875,
"learning_rate": 0.00023185840707964602,
"loss": 1.1299,
"step": 190
},
{
"epoch": 0.8451327433628318,
"grad_norm": 0.046875,
"learning_rate": 0.0002309734513274336,
"loss": 1.0395,
"step": 191
},
{
"epoch": 0.8495575221238938,
"grad_norm": 0.04296875,
"learning_rate": 0.00023008849557522125,
"loss": 0.9442,
"step": 192
},
{
"epoch": 0.8539823008849557,
"grad_norm": 0.05078125,
"learning_rate": 0.00022920353982300884,
"loss": 1.0056,
"step": 193
},
{
"epoch": 0.8584070796460177,
"grad_norm": 0.050537109375,
"learning_rate": 0.00022831858407079648,
"loss": 0.9217,
"step": 194
},
{
"epoch": 0.8628318584070797,
"grad_norm": 0.040771484375,
"learning_rate": 0.0002274336283185841,
"loss": 0.9522,
"step": 195
},
{
"epoch": 0.8672566371681416,
"grad_norm": 0.042236328125,
"learning_rate": 0.00022654867256637168,
"loss": 0.9525,
"step": 196
},
{
"epoch": 0.8716814159292036,
"grad_norm": 0.048095703125,
"learning_rate": 0.00022566371681415932,
"loss": 1.0493,
"step": 197
},
{
"epoch": 0.8761061946902655,
"grad_norm": 0.047607421875,
"learning_rate": 0.0002247787610619469,
"loss": 1.1643,
"step": 198
},
{
"epoch": 0.8805309734513275,
"grad_norm": 0.041748046875,
"learning_rate": 0.00022389380530973453,
"loss": 0.8968,
"step": 199
},
{
"epoch": 0.8849557522123894,
"grad_norm": 0.046875,
"learning_rate": 0.0002230088495575221,
"loss": 0.8145,
"step": 200
},
{
"epoch": 0.8893805309734514,
"grad_norm": 0.0693359375,
"learning_rate": 0.00022212389380530975,
"loss": 1.1892,
"step": 201
},
{
"epoch": 0.8938053097345132,
"grad_norm": 0.0673828125,
"learning_rate": 0.00022123893805309734,
"loss": 0.9646,
"step": 202
},
{
"epoch": 0.8982300884955752,
"grad_norm": 0.046630859375,
"learning_rate": 0.00022035398230088498,
"loss": 1.0692,
"step": 203
},
{
"epoch": 0.9026548672566371,
"grad_norm": 0.06396484375,
"learning_rate": 0.00021946902654867257,
"loss": 0.9034,
"step": 204
},
{
"epoch": 0.9070796460176991,
"grad_norm": 0.04150390625,
"learning_rate": 0.00021858407079646019,
"loss": 1.1094,
"step": 205
},
{
"epoch": 0.911504424778761,
"grad_norm": 0.064453125,
"learning_rate": 0.00021769911504424777,
"loss": 1.1966,
"step": 206
},
{
"epoch": 0.915929203539823,
"grad_norm": 0.049560546875,
"learning_rate": 0.00021681415929203541,
"loss": 1.1902,
"step": 207
},
{
"epoch": 0.9203539823008849,
"grad_norm": 0.06884765625,
"learning_rate": 0.000215929203539823,
"loss": 1.1077,
"step": 208
},
{
"epoch": 0.9247787610619469,
"grad_norm": 0.042236328125,
"learning_rate": 0.00021504424778761064,
"loss": 0.9293,
"step": 209
},
{
"epoch": 0.9292035398230089,
"grad_norm": 0.040283203125,
"learning_rate": 0.00021415929203539826,
"loss": 1.0238,
"step": 210
},
{
"epoch": 0.9336283185840708,
"grad_norm": 0.046142578125,
"learning_rate": 0.00021327433628318585,
"loss": 0.9889,
"step": 211
},
{
"epoch": 0.9380530973451328,
"grad_norm": 0.048583984375,
"learning_rate": 0.0002123893805309735,
"loss": 1.0614,
"step": 212
},
{
"epoch": 0.9424778761061947,
"grad_norm": 0.048095703125,
"learning_rate": 0.00021150442477876107,
"loss": 1.0836,
"step": 213
},
{
"epoch": 0.9469026548672567,
"grad_norm": 0.047607421875,
"learning_rate": 0.0002106194690265487,
"loss": 1.0815,
"step": 214
},
{
"epoch": 0.9513274336283186,
"grad_norm": 0.039794921875,
"learning_rate": 0.00020973451327433628,
"loss": 1.0021,
"step": 215
},
{
"epoch": 0.9557522123893806,
"grad_norm": 0.049072265625,
"learning_rate": 0.00020884955752212392,
"loss": 1.0002,
"step": 216
},
{
"epoch": 0.9601769911504425,
"grad_norm": 0.04541015625,
"learning_rate": 0.0002079646017699115,
"loss": 1.2081,
"step": 217
},
{
"epoch": 0.9646017699115044,
"grad_norm": 0.0439453125,
"learning_rate": 0.00020707964601769915,
"loss": 1.0711,
"step": 218
},
{
"epoch": 0.9690265486725663,
"grad_norm": 0.049072265625,
"learning_rate": 0.00020619469026548673,
"loss": 1.0342,
"step": 219
},
{
"epoch": 0.9734513274336283,
"grad_norm": 0.0556640625,
"learning_rate": 0.00020530973451327435,
"loss": 1.0103,
"step": 220
},
{
"epoch": 0.9778761061946902,
"grad_norm": 0.04931640625,
"learning_rate": 0.00020442477876106194,
"loss": 0.9692,
"step": 221
},
{
"epoch": 0.9823008849557522,
"grad_norm": 0.04296875,
"learning_rate": 0.00020353982300884958,
"loss": 0.9639,
"step": 222
},
{
"epoch": 0.9867256637168141,
"grad_norm": 0.040771484375,
"learning_rate": 0.00020265486725663717,
"loss": 0.9039,
"step": 223
},
{
"epoch": 0.9911504424778761,
"grad_norm": 0.049560546875,
"learning_rate": 0.00020176991150442478,
"loss": 0.9265,
"step": 224
},
{
"epoch": 0.995575221238938,
"grad_norm": 0.04248046875,
"learning_rate": 0.00020088495575221237,
"loss": 0.8961,
"step": 225
},
{
"epoch": 1.0,
"grad_norm": 0.0625,
"learning_rate": 0.0002,
"loss": 1.0299,
"step": 226
},
{
"epoch": 1.0044247787610618,
"grad_norm": 0.052978515625,
"learning_rate": 0.00019911504424778762,
"loss": 0.8533,
"step": 227
},
{
"epoch": 1.008849557522124,
"grad_norm": 0.042236328125,
"learning_rate": 0.00019823008849557524,
"loss": 0.937,
"step": 228
},
{
"epoch": 1.0132743362831858,
"grad_norm": 0.05029296875,
"learning_rate": 0.00019734513274336283,
"loss": 0.8202,
"step": 229
},
{
"epoch": 1.0176991150442478,
"grad_norm": 0.0517578125,
"learning_rate": 0.00019646017699115044,
"loss": 0.8976,
"step": 230
},
{
"epoch": 1.0221238938053097,
"grad_norm": 0.048828125,
"learning_rate": 0.00019557522123893806,
"loss": 0.8791,
"step": 231
},
{
"epoch": 1.0265486725663717,
"grad_norm": 0.050537109375,
"learning_rate": 0.00019469026548672567,
"loss": 1.0753,
"step": 232
},
{
"epoch": 1.0309734513274336,
"grad_norm": 0.05615234375,
"learning_rate": 0.00019380530973451328,
"loss": 1.0464,
"step": 233
},
{
"epoch": 1.0353982300884956,
"grad_norm": 0.059326171875,
"learning_rate": 0.00019292035398230087,
"loss": 0.8115,
"step": 234
},
{
"epoch": 1.0398230088495575,
"grad_norm": 0.058349609375,
"learning_rate": 0.0001920353982300885,
"loss": 0.9851,
"step": 235
},
{
"epoch": 1.0442477876106195,
"grad_norm": 0.068359375,
"learning_rate": 0.00019115044247787613,
"loss": 0.8867,
"step": 236
},
{
"epoch": 1.0486725663716814,
"grad_norm": 0.059814453125,
"learning_rate": 0.00019026548672566374,
"loss": 0.7882,
"step": 237
},
{
"epoch": 1.0530973451327434,
"grad_norm": 0.06494140625,
"learning_rate": 0.00018938053097345133,
"loss": 1.0028,
"step": 238
},
{
"epoch": 1.0575221238938053,
"grad_norm": 0.06103515625,
"learning_rate": 0.00018849557522123894,
"loss": 0.9446,
"step": 239
},
{
"epoch": 1.0619469026548674,
"grad_norm": 0.059814453125,
"learning_rate": 0.00018761061946902656,
"loss": 1.0249,
"step": 240
},
{
"epoch": 1.0663716814159292,
"grad_norm": 0.053955078125,
"learning_rate": 0.00018672566371681417,
"loss": 0.9277,
"step": 241
},
{
"epoch": 1.0707964601769913,
"grad_norm": 0.0751953125,
"learning_rate": 0.0001858407079646018,
"loss": 0.8228,
"step": 242
},
{
"epoch": 1.075221238938053,
"grad_norm": 0.058837890625,
"learning_rate": 0.00018495575221238938,
"loss": 0.8757,
"step": 243
},
{
"epoch": 1.079646017699115,
"grad_norm": 0.059326171875,
"learning_rate": 0.000184070796460177,
"loss": 0.7868,
"step": 244
},
{
"epoch": 1.084070796460177,
"grad_norm": 0.07275390625,
"learning_rate": 0.0001831858407079646,
"loss": 0.878,
"step": 245
},
{
"epoch": 1.0884955752212389,
"grad_norm": 0.05908203125,
"learning_rate": 0.00018230088495575222,
"loss": 0.8944,
"step": 246
},
{
"epoch": 1.092920353982301,
"grad_norm": 0.059326171875,
"learning_rate": 0.00018141592920353983,
"loss": 0.8831,
"step": 247
},
{
"epoch": 1.0973451327433628,
"grad_norm": 0.060302734375,
"learning_rate": 0.00018053097345132742,
"loss": 0.9312,
"step": 248
},
{
"epoch": 1.1017699115044248,
"grad_norm": 0.053955078125,
"learning_rate": 0.00017964601769911504,
"loss": 0.7488,
"step": 249
},
{
"epoch": 1.1061946902654867,
"grad_norm": 0.06298828125,
"learning_rate": 0.00017876106194690265,
"loss": 0.9677,
"step": 250
},
{
"epoch": 1.1106194690265487,
"grad_norm": 0.06298828125,
"learning_rate": 0.0001778761061946903,
"loss": 0.8391,
"step": 251
},
{
"epoch": 1.1150442477876106,
"grad_norm": 0.061279296875,
"learning_rate": 0.0001769911504424779,
"loss": 0.9225,
"step": 252
},
{
"epoch": 1.1194690265486726,
"grad_norm": 0.080078125,
"learning_rate": 0.0001761061946902655,
"loss": 0.7969,
"step": 253
},
{
"epoch": 1.1238938053097345,
"grad_norm": 0.06494140625,
"learning_rate": 0.0001752212389380531,
"loss": 0.8957,
"step": 254
},
{
"epoch": 1.1283185840707965,
"grad_norm": 0.062255859375,
"learning_rate": 0.00017433628318584072,
"loss": 0.9192,
"step": 255
},
{
"epoch": 1.1327433628318584,
"grad_norm": 0.1005859375,
"learning_rate": 0.00017345132743362834,
"loss": 0.8669,
"step": 256
},
{
"epoch": 1.1371681415929205,
"grad_norm": 0.0810546875,
"learning_rate": 0.00017256637168141595,
"loss": 0.9332,
"step": 257
},
{
"epoch": 1.1415929203539823,
"grad_norm": 0.06689453125,
"learning_rate": 0.00017168141592920354,
"loss": 0.8392,
"step": 258
},
{
"epoch": 1.1460176991150441,
"grad_norm": 0.06494140625,
"learning_rate": 0.00017079646017699115,
"loss": 1.1159,
"step": 259
},
{
"epoch": 1.1504424778761062,
"grad_norm": 0.0625,
"learning_rate": 0.00016991150442477877,
"loss": 0.9649,
"step": 260
},
{
"epoch": 1.154867256637168,
"grad_norm": 0.059326171875,
"learning_rate": 0.00016902654867256638,
"loss": 0.9653,
"step": 261
},
{
"epoch": 1.1592920353982301,
"grad_norm": 0.05322265625,
"learning_rate": 0.000168141592920354,
"loss": 0.8342,
"step": 262
},
{
"epoch": 1.163716814159292,
"grad_norm": 0.109375,
"learning_rate": 0.00016725663716814158,
"loss": 0.7385,
"step": 263
},
{
"epoch": 1.168141592920354,
"grad_norm": 0.076171875,
"learning_rate": 0.0001663716814159292,
"loss": 0.7605,
"step": 264
},
{
"epoch": 1.1725663716814159,
"grad_norm": 0.057373046875,
"learning_rate": 0.00016548672566371681,
"loss": 0.8457,
"step": 265
},
{
"epoch": 1.176991150442478,
"grad_norm": 0.08447265625,
"learning_rate": 0.00016460176991150443,
"loss": 0.872,
"step": 266
},
{
"epoch": 1.1814159292035398,
"grad_norm": 0.07470703125,
"learning_rate": 0.00016371681415929204,
"loss": 1.0322,
"step": 267
},
{
"epoch": 1.1858407079646018,
"grad_norm": 0.06640625,
"learning_rate": 0.00016283185840707966,
"loss": 1.0532,
"step": 268
},
{
"epoch": 1.1902654867256637,
"grad_norm": 0.059814453125,
"learning_rate": 0.00016194690265486727,
"loss": 0.9205,
"step": 269
},
{
"epoch": 1.1946902654867257,
"grad_norm": 0.060546875,
"learning_rate": 0.0001610619469026549,
"loss": 0.8789,
"step": 270
},
{
"epoch": 1.1991150442477876,
"grad_norm": 0.0654296875,
"learning_rate": 0.0001601769911504425,
"loss": 1.0501,
"step": 271
},
{
"epoch": 1.2035398230088497,
"grad_norm": 0.0634765625,
"learning_rate": 0.0001592920353982301,
"loss": 0.8666,
"step": 272
},
{
"epoch": 1.2079646017699115,
"grad_norm": 0.0595703125,
"learning_rate": 0.0001584070796460177,
"loss": 0.8761,
"step": 273
},
{
"epoch": 1.2123893805309733,
"grad_norm": 0.057373046875,
"learning_rate": 0.00015752212389380532,
"loss": 0.8827,
"step": 274
},
{
"epoch": 1.2168141592920354,
"grad_norm": 0.07373046875,
"learning_rate": 0.00015663716814159293,
"loss": 0.8162,
"step": 275
},
{
"epoch": 1.2212389380530975,
"grad_norm": 0.06494140625,
"learning_rate": 0.00015575221238938055,
"loss": 0.7613,
"step": 276
},
{
"epoch": 1.2256637168141593,
"grad_norm": 0.06494140625,
"learning_rate": 0.00015486725663716813,
"loss": 0.825,
"step": 277
},
{
"epoch": 1.2300884955752212,
"grad_norm": 0.061767578125,
"learning_rate": 0.00015398230088495575,
"loss": 0.9633,
"step": 278
},
{
"epoch": 1.2345132743362832,
"grad_norm": 0.0595703125,
"learning_rate": 0.00015309734513274336,
"loss": 0.9036,
"step": 279
},
{
"epoch": 1.238938053097345,
"grad_norm": 0.076171875,
"learning_rate": 0.00015221238938053098,
"loss": 0.9527,
"step": 280
},
{
"epoch": 1.2433628318584071,
"grad_norm": 0.06005859375,
"learning_rate": 0.0001513274336283186,
"loss": 0.9089,
"step": 281
},
{
"epoch": 1.247787610619469,
"grad_norm": 0.056884765625,
"learning_rate": 0.00015044247787610618,
"loss": 0.8911,
"step": 282
},
{
"epoch": 1.252212389380531,
"grad_norm": 0.0908203125,
"learning_rate": 0.00014955752212389382,
"loss": 0.7871,
"step": 283
},
{
"epoch": 1.2566371681415929,
"grad_norm": 0.0771484375,
"learning_rate": 0.00014867256637168144,
"loss": 0.8415,
"step": 284
},
{
"epoch": 1.261061946902655,
"grad_norm": 0.07177734375,
"learning_rate": 0.00014778761061946905,
"loss": 1.0105,
"step": 285
},
{
"epoch": 1.2654867256637168,
"grad_norm": 0.0986328125,
"learning_rate": 0.00014690265486725664,
"loss": 0.9677,
"step": 286
},
{
"epoch": 1.2699115044247788,
"grad_norm": 0.0888671875,
"learning_rate": 0.00014601769911504425,
"loss": 0.837,
"step": 287
},
{
"epoch": 1.2743362831858407,
"grad_norm": 0.126953125,
"learning_rate": 0.00014513274336283187,
"loss": 0.8605,
"step": 288
},
{
"epoch": 1.2787610619469025,
"grad_norm": 0.06298828125,
"learning_rate": 0.00014424778761061948,
"loss": 0.8717,
"step": 289
},
{
"epoch": 1.2831858407079646,
"grad_norm": 0.08740234375,
"learning_rate": 0.0001433628318584071,
"loss": 1.0469,
"step": 290
},
{
"epoch": 1.2876106194690267,
"grad_norm": 0.061767578125,
"learning_rate": 0.00014247787610619468,
"loss": 0.9339,
"step": 291
},
{
"epoch": 1.2920353982300885,
"grad_norm": 0.072265625,
"learning_rate": 0.0001415929203539823,
"loss": 0.7235,
"step": 292
},
{
"epoch": 1.2964601769911503,
"grad_norm": 0.087890625,
"learning_rate": 0.0001407079646017699,
"loss": 0.8648,
"step": 293
},
{
"epoch": 1.3008849557522124,
"grad_norm": 0.062255859375,
"learning_rate": 0.00013982300884955753,
"loss": 0.8842,
"step": 294
},
{
"epoch": 1.3053097345132743,
"grad_norm": 0.08056640625,
"learning_rate": 0.00013893805309734514,
"loss": 0.9593,
"step": 295
},
{
"epoch": 1.3097345132743363,
"grad_norm": 0.0771484375,
"learning_rate": 0.00013805309734513276,
"loss": 0.9122,
"step": 296
},
{
"epoch": 1.3141592920353982,
"grad_norm": 0.06396484375,
"learning_rate": 0.00013716814159292034,
"loss": 1.0082,
"step": 297
},
{
"epoch": 1.3185840707964602,
"grad_norm": 0.06298828125,
"learning_rate": 0.00013628318584070796,
"loss": 0.884,
"step": 298
},
{
"epoch": 1.323008849557522,
"grad_norm": 0.08349609375,
"learning_rate": 0.0001353982300884956,
"loss": 0.8348,
"step": 299
},
{
"epoch": 1.3274336283185841,
"grad_norm": 0.0732421875,
"learning_rate": 0.00013451327433628321,
"loss": 0.747,
"step": 300
},
{
"epoch": 1.331858407079646,
"grad_norm": 0.06396484375,
"learning_rate": 0.0001336283185840708,
"loss": 0.8841,
"step": 301
},
{
"epoch": 1.336283185840708,
"grad_norm": 0.06005859375,
"learning_rate": 0.00013274336283185842,
"loss": 0.8985,
"step": 302
},
{
"epoch": 1.3407079646017699,
"grad_norm": 0.068359375,
"learning_rate": 0.00013185840707964603,
"loss": 0.9008,
"step": 303
},
{
"epoch": 1.3451327433628317,
"grad_norm": 0.076171875,
"learning_rate": 0.00013097345132743365,
"loss": 0.8909,
"step": 304
},
{
"epoch": 1.3495575221238938,
"grad_norm": 0.09521484375,
"learning_rate": 0.00013008849557522126,
"loss": 0.8108,
"step": 305
},
{
"epoch": 1.3539823008849559,
"grad_norm": 0.08154296875,
"learning_rate": 0.00012920353982300885,
"loss": 0.8546,
"step": 306
},
{
"epoch": 1.3584070796460177,
"grad_norm": 0.0771484375,
"learning_rate": 0.00012831858407079646,
"loss": 1.0212,
"step": 307
},
{
"epoch": 1.3628318584070795,
"grad_norm": 0.06201171875,
"learning_rate": 0.00012743362831858408,
"loss": 0.974,
"step": 308
},
{
"epoch": 1.3672566371681416,
"grad_norm": 0.095703125,
"learning_rate": 0.0001265486725663717,
"loss": 0.7493,
"step": 309
},
{
"epoch": 1.3716814159292037,
"grad_norm": 0.09765625,
"learning_rate": 0.0001256637168141593,
"loss": 1.0118,
"step": 310
},
{
"epoch": 1.3761061946902655,
"grad_norm": 0.08740234375,
"learning_rate": 0.0001247787610619469,
"loss": 0.8243,
"step": 311
},
{
"epoch": 1.3805309734513274,
"grad_norm": 0.06884765625,
"learning_rate": 0.0001238938053097345,
"loss": 0.9024,
"step": 312
},
{
"epoch": 1.3849557522123894,
"grad_norm": 0.08740234375,
"learning_rate": 0.00012300884955752212,
"loss": 0.9018,
"step": 313
},
{
"epoch": 1.3893805309734513,
"grad_norm": 0.09814453125,
"learning_rate": 0.00012212389380530974,
"loss": 1.1168,
"step": 314
},
{
"epoch": 1.3938053097345133,
"grad_norm": 0.07861328125,
"learning_rate": 0.00012123893805309736,
"loss": 0.9847,
"step": 315
},
{
"epoch": 1.3982300884955752,
"grad_norm": 0.07080078125,
"learning_rate": 0.00012035398230088497,
"loss": 0.9884,
"step": 316
},
{
"epoch": 1.4026548672566372,
"grad_norm": 0.07568359375,
"learning_rate": 0.00011946902654867258,
"loss": 0.9483,
"step": 317
},
{
"epoch": 1.407079646017699,
"grad_norm": 0.06787109375,
"learning_rate": 0.0001185840707964602,
"loss": 0.8768,
"step": 318
},
{
"epoch": 1.411504424778761,
"grad_norm": 0.0751953125,
"learning_rate": 0.0001176991150442478,
"loss": 0.9072,
"step": 319
},
{
"epoch": 1.415929203539823,
"grad_norm": 0.0810546875,
"learning_rate": 0.00011681415929203541,
"loss": 0.8627,
"step": 320
},
{
"epoch": 1.420353982300885,
"grad_norm": 0.07275390625,
"learning_rate": 0.00011592920353982301,
"loss": 0.9518,
"step": 321
},
{
"epoch": 1.424778761061947,
"grad_norm": 0.0830078125,
"learning_rate": 0.00011504424778761063,
"loss": 0.8705,
"step": 322
},
{
"epoch": 1.4292035398230087,
"grad_norm": 0.061767578125,
"learning_rate": 0.00011415929203539824,
"loss": 0.8535,
"step": 323
},
{
"epoch": 1.4336283185840708,
"grad_norm": 0.06396484375,
"learning_rate": 0.00011327433628318584,
"loss": 0.8835,
"step": 324
},
{
"epoch": 1.4380530973451329,
"grad_norm": 0.09033203125,
"learning_rate": 0.00011238938053097346,
"loss": 1.1187,
"step": 325
},
{
"epoch": 1.4424778761061947,
"grad_norm": 0.08935546875,
"learning_rate": 0.00011150442477876106,
"loss": 0.6991,
"step": 326
},
{
"epoch": 1.4469026548672566,
"grad_norm": 0.10546875,
"learning_rate": 0.00011061946902654867,
"loss": 0.8172,
"step": 327
},
{
"epoch": 1.4513274336283186,
"grad_norm": 0.1015625,
"learning_rate": 0.00010973451327433629,
"loss": 0.8526,
"step": 328
},
{
"epoch": 1.4557522123893805,
"grad_norm": 0.06640625,
"learning_rate": 0.00010884955752212389,
"loss": 0.8048,
"step": 329
},
{
"epoch": 1.4601769911504425,
"grad_norm": 0.0693359375,
"learning_rate": 0.0001079646017699115,
"loss": 0.9438,
"step": 330
},
{
"epoch": 1.4646017699115044,
"grad_norm": 0.08837890625,
"learning_rate": 0.00010707964601769913,
"loss": 0.9667,
"step": 331
},
{
"epoch": 1.4690265486725664,
"grad_norm": 0.0810546875,
"learning_rate": 0.00010619469026548674,
"loss": 1.0007,
"step": 332
},
{
"epoch": 1.4734513274336283,
"grad_norm": 0.07470703125,
"learning_rate": 0.00010530973451327434,
"loss": 0.971,
"step": 333
},
{
"epoch": 1.4778761061946903,
"grad_norm": 0.09033203125,
"learning_rate": 0.00010442477876106196,
"loss": 0.8334,
"step": 334
},
{
"epoch": 1.4823008849557522,
"grad_norm": 0.06640625,
"learning_rate": 0.00010353982300884957,
"loss": 0.7885,
"step": 335
},
{
"epoch": 1.4867256637168142,
"grad_norm": 0.0947265625,
"learning_rate": 0.00010265486725663717,
"loss": 0.825,
"step": 336
},
{
"epoch": 1.491150442477876,
"grad_norm": 0.08154296875,
"learning_rate": 0.00010176991150442479,
"loss": 0.9044,
"step": 337
},
{
"epoch": 1.495575221238938,
"grad_norm": 0.07763671875,
"learning_rate": 0.00010088495575221239,
"loss": 0.7607,
"step": 338
},
{
"epoch": 1.5,
"grad_norm": 0.0693359375,
"learning_rate": 0.0001,
"loss": 0.966,
"step": 339
},
{
"epoch": 1.504424778761062,
"grad_norm": 0.1005859375,
"learning_rate": 9.911504424778762e-05,
"loss": 0.7745,
"step": 340
},
{
"epoch": 1.508849557522124,
"grad_norm": 0.058837890625,
"learning_rate": 9.823008849557522e-05,
"loss": 0.8849,
"step": 341
},
{
"epoch": 1.5132743362831858,
"grad_norm": 0.0703125,
"learning_rate": 9.734513274336283e-05,
"loss": 0.9905,
"step": 342
},
{
"epoch": 1.5176991150442478,
"grad_norm": 0.1025390625,
"learning_rate": 9.646017699115044e-05,
"loss": 0.8459,
"step": 343
},
{
"epoch": 1.5221238938053099,
"grad_norm": 0.07275390625,
"learning_rate": 9.557522123893806e-05,
"loss": 0.8842,
"step": 344
},
{
"epoch": 1.5265486725663717,
"grad_norm": 0.083984375,
"learning_rate": 9.469026548672566e-05,
"loss": 1.0654,
"step": 345
},
{
"epoch": 1.5309734513274336,
"grad_norm": 0.0615234375,
"learning_rate": 9.380530973451328e-05,
"loss": 0.8734,
"step": 346
},
{
"epoch": 1.5353982300884956,
"grad_norm": 0.0791015625,
"learning_rate": 9.29203539823009e-05,
"loss": 0.9752,
"step": 347
},
{
"epoch": 1.5398230088495575,
"grad_norm": 0.0751953125,
"learning_rate": 9.20353982300885e-05,
"loss": 0.7664,
"step": 348
},
{
"epoch": 1.5442477876106193,
"grad_norm": 0.0888671875,
"learning_rate": 9.115044247787611e-05,
"loss": 0.8328,
"step": 349
},
{
"epoch": 1.5486725663716814,
"grad_norm": 0.0712890625,
"learning_rate": 9.026548672566371e-05,
"loss": 0.8581,
"step": 350
},
{
"epoch": 1.5530973451327434,
"grad_norm": 0.0888671875,
"learning_rate": 8.938053097345133e-05,
"loss": 0.7521,
"step": 351
},
{
"epoch": 1.5575221238938053,
"grad_norm": 0.0810546875,
"learning_rate": 8.849557522123895e-05,
"loss": 1.1778,
"step": 352
},
{
"epoch": 1.5619469026548671,
"grad_norm": 0.08447265625,
"learning_rate": 8.761061946902655e-05,
"loss": 0.8007,
"step": 353
},
{
"epoch": 1.5663716814159292,
"grad_norm": 0.08544921875,
"learning_rate": 8.672566371681417e-05,
"loss": 1.1795,
"step": 354
},
{
"epoch": 1.5707964601769913,
"grad_norm": 0.08642578125,
"learning_rate": 8.584070796460177e-05,
"loss": 0.9632,
"step": 355
},
{
"epoch": 1.575221238938053,
"grad_norm": 0.11572265625,
"learning_rate": 8.495575221238938e-05,
"loss": 0.7671,
"step": 356
},
{
"epoch": 1.579646017699115,
"grad_norm": 0.1396484375,
"learning_rate": 8.4070796460177e-05,
"loss": 0.692,
"step": 357
},
{
"epoch": 1.584070796460177,
"grad_norm": 0.10791015625,
"learning_rate": 8.31858407079646e-05,
"loss": 0.6548,
"step": 358
},
{
"epoch": 1.588495575221239,
"grad_norm": 0.080078125,
"learning_rate": 8.230088495575221e-05,
"loss": 0.805,
"step": 359
},
{
"epoch": 1.592920353982301,
"grad_norm": 0.06005859375,
"learning_rate": 8.141592920353983e-05,
"loss": 0.7988,
"step": 360
},
{
"epoch": 1.5973451327433628,
"grad_norm": 0.07861328125,
"learning_rate": 8.053097345132744e-05,
"loss": 0.9695,
"step": 361
},
{
"epoch": 1.6017699115044248,
"grad_norm": 0.07421875,
"learning_rate": 7.964601769911504e-05,
"loss": 1.0397,
"step": 362
},
{
"epoch": 1.606194690265487,
"grad_norm": 0.0830078125,
"learning_rate": 7.876106194690266e-05,
"loss": 0.9098,
"step": 363
},
{
"epoch": 1.6106194690265485,
"grad_norm": 0.07861328125,
"learning_rate": 7.787610619469027e-05,
"loss": 0.9249,
"step": 364
},
{
"epoch": 1.6150442477876106,
"grad_norm": 0.0615234375,
"learning_rate": 7.699115044247787e-05,
"loss": 0.7443,
"step": 365
},
{
"epoch": 1.6194690265486726,
"grad_norm": 0.08935546875,
"learning_rate": 7.610619469026549e-05,
"loss": 0.8042,
"step": 366
},
{
"epoch": 1.6238938053097345,
"grad_norm": 0.0810546875,
"learning_rate": 7.522123893805309e-05,
"loss": 0.8271,
"step": 367
},
{
"epoch": 1.6283185840707963,
"grad_norm": 0.06884765625,
"learning_rate": 7.433628318584072e-05,
"loss": 0.9711,
"step": 368
},
{
"epoch": 1.6327433628318584,
"grad_norm": 0.06689453125,
"learning_rate": 7.345132743362832e-05,
"loss": 0.8821,
"step": 369
},
{
"epoch": 1.6371681415929205,
"grad_norm": 0.0556640625,
"learning_rate": 7.256637168141593e-05,
"loss": 0.7417,
"step": 370
},
{
"epoch": 1.6415929203539823,
"grad_norm": 0.06591796875,
"learning_rate": 7.168141592920355e-05,
"loss": 0.9247,
"step": 371
},
{
"epoch": 1.6460176991150441,
"grad_norm": 0.06396484375,
"learning_rate": 7.079646017699115e-05,
"loss": 0.9101,
"step": 372
},
{
"epoch": 1.6504424778761062,
"grad_norm": 0.091796875,
"learning_rate": 6.991150442477876e-05,
"loss": 1.0123,
"step": 373
},
{
"epoch": 1.6548672566371683,
"grad_norm": 0.103515625,
"learning_rate": 6.902654867256638e-05,
"loss": 0.7791,
"step": 374
},
{
"epoch": 1.6592920353982301,
"grad_norm": 0.07275390625,
"learning_rate": 6.814159292035398e-05,
"loss": 1.0589,
"step": 375
},
{
"epoch": 1.663716814159292,
"grad_norm": 0.058349609375,
"learning_rate": 6.725663716814161e-05,
"loss": 0.8401,
"step": 376
},
{
"epoch": 1.668141592920354,
"grad_norm": 0.059814453125,
"learning_rate": 6.637168141592921e-05,
"loss": 0.8201,
"step": 377
},
{
"epoch": 1.672566371681416,
"grad_norm": 0.0927734375,
"learning_rate": 6.548672566371682e-05,
"loss": 0.913,
"step": 378
},
{
"epoch": 1.676991150442478,
"grad_norm": 0.060302734375,
"learning_rate": 6.460176991150442e-05,
"loss": 0.8276,
"step": 379
},
{
"epoch": 1.6814159292035398,
"grad_norm": 0.08349609375,
"learning_rate": 6.371681415929204e-05,
"loss": 0.7729,
"step": 380
},
{
"epoch": 1.6858407079646018,
"grad_norm": 0.0703125,
"learning_rate": 6.283185840707965e-05,
"loss": 1.0113,
"step": 381
},
{
"epoch": 1.6902654867256637,
"grad_norm": 0.0634765625,
"learning_rate": 6.194690265486725e-05,
"loss": 0.8446,
"step": 382
},
{
"epoch": 1.6946902654867255,
"grad_norm": 0.0673828125,
"learning_rate": 6.106194690265487e-05,
"loss": 0.8878,
"step": 383
},
{
"epoch": 1.6991150442477876,
"grad_norm": 0.1103515625,
"learning_rate": 6.017699115044248e-05,
"loss": 0.6718,
"step": 384
},
{
"epoch": 1.7035398230088497,
"grad_norm": 0.060302734375,
"learning_rate": 5.92920353982301e-05,
"loss": 0.8153,
"step": 385
},
{
"epoch": 1.7079646017699115,
"grad_norm": 0.0712890625,
"learning_rate": 5.8407079646017705e-05,
"loss": 0.9931,
"step": 386
},
{
"epoch": 1.7123893805309733,
"grad_norm": 0.0556640625,
"learning_rate": 5.752212389380531e-05,
"loss": 0.7466,
"step": 387
},
{
"epoch": 1.7168141592920354,
"grad_norm": 0.09033203125,
"learning_rate": 5.663716814159292e-05,
"loss": 0.9364,
"step": 388
},
{
"epoch": 1.7212389380530975,
"grad_norm": 0.068359375,
"learning_rate": 5.575221238938053e-05,
"loss": 0.8851,
"step": 389
},
{
"epoch": 1.7256637168141593,
"grad_norm": 0.061279296875,
"learning_rate": 5.486725663716814e-05,
"loss": 0.8714,
"step": 390
},
{
"epoch": 1.7300884955752212,
"grad_norm": 0.06982421875,
"learning_rate": 5.398230088495575e-05,
"loss": 0.8885,
"step": 391
},
{
"epoch": 1.7345132743362832,
"grad_norm": 0.06298828125,
"learning_rate": 5.309734513274337e-05,
"loss": 0.8724,
"step": 392
},
{
"epoch": 1.7389380530973453,
"grad_norm": 0.08056640625,
"learning_rate": 5.221238938053098e-05,
"loss": 1.1328,
"step": 393
},
{
"epoch": 1.7433628318584071,
"grad_norm": 0.099609375,
"learning_rate": 5.132743362831859e-05,
"loss": 0.7735,
"step": 394
},
{
"epoch": 1.747787610619469,
"grad_norm": 0.06982421875,
"learning_rate": 5.0442477876106195e-05,
"loss": 0.9325,
"step": 395
},
{
"epoch": 1.752212389380531,
"grad_norm": 0.07080078125,
"learning_rate": 4.955752212389381e-05,
"loss": 0.9273,
"step": 396
},
{
"epoch": 1.7566371681415929,
"grad_norm": 0.10009765625,
"learning_rate": 4.867256637168142e-05,
"loss": 0.7756,
"step": 397
},
{
"epoch": 1.7610619469026547,
"grad_norm": 0.0908203125,
"learning_rate": 4.778761061946903e-05,
"loss": 1.0591,
"step": 398
},
{
"epoch": 1.7654867256637168,
"grad_norm": 0.09423828125,
"learning_rate": 4.690265486725664e-05,
"loss": 0.7867,
"step": 399
},
{
"epoch": 1.7699115044247788,
"grad_norm": 0.0888671875,
"learning_rate": 4.601769911504425e-05,
"loss": 0.8369,
"step": 400
},
{
"epoch": 1.7743362831858407,
"grad_norm": 0.06396484375,
"learning_rate": 4.5132743362831855e-05,
"loss": 0.9999,
"step": 401
},
{
"epoch": 1.7787610619469025,
"grad_norm": 0.061767578125,
"learning_rate": 4.4247787610619477e-05,
"loss": 0.8612,
"step": 402
},
{
"epoch": 1.7831858407079646,
"grad_norm": 0.09716796875,
"learning_rate": 4.3362831858407084e-05,
"loss": 0.8529,
"step": 403
},
{
"epoch": 1.7876106194690267,
"grad_norm": 0.07763671875,
"learning_rate": 4.247787610619469e-05,
"loss": 0.8809,
"step": 404
},
{
"epoch": 1.7920353982300885,
"grad_norm": 0.07177734375,
"learning_rate": 4.15929203539823e-05,
"loss": 0.9739,
"step": 405
},
{
"epoch": 1.7964601769911503,
"grad_norm": 0.07568359375,
"learning_rate": 4.0707964601769914e-05,
"loss": 0.9416,
"step": 406
},
{
"epoch": 1.8008849557522124,
"grad_norm": 0.061767578125,
"learning_rate": 3.982300884955752e-05,
"loss": 0.8359,
"step": 407
},
{
"epoch": 1.8053097345132745,
"grad_norm": 0.0712890625,
"learning_rate": 3.893805309734514e-05,
"loss": 0.9323,
"step": 408
},
{
"epoch": 1.8097345132743363,
"grad_norm": 0.0810546875,
"learning_rate": 3.8053097345132744e-05,
"loss": 0.8084,
"step": 409
},
{
"epoch": 1.8141592920353982,
"grad_norm": 0.06298828125,
"learning_rate": 3.716814159292036e-05,
"loss": 0.9237,
"step": 410
},
{
"epoch": 1.8185840707964602,
"grad_norm": 0.08447265625,
"learning_rate": 3.628318584070797e-05,
"loss": 1.0047,
"step": 411
},
{
"epoch": 1.823008849557522,
"grad_norm": 0.0654296875,
"learning_rate": 3.5398230088495574e-05,
"loss": 0.9763,
"step": 412
},
{
"epoch": 1.827433628318584,
"grad_norm": 0.06201171875,
"learning_rate": 3.451327433628319e-05,
"loss": 0.7498,
"step": 413
},
{
"epoch": 1.831858407079646,
"grad_norm": 0.087890625,
"learning_rate": 3.3628318584070804e-05,
"loss": 0.8973,
"step": 414
},
{
"epoch": 1.836283185840708,
"grad_norm": 0.0966796875,
"learning_rate": 3.274336283185841e-05,
"loss": 0.9526,
"step": 415
},
{
"epoch": 1.8407079646017699,
"grad_norm": 0.061767578125,
"learning_rate": 3.185840707964602e-05,
"loss": 0.9184,
"step": 416
},
{
"epoch": 1.8451327433628317,
"grad_norm": 0.0673828125,
"learning_rate": 3.097345132743363e-05,
"loss": 0.9124,
"step": 417
},
{
"epoch": 1.8495575221238938,
"grad_norm": 0.056884765625,
"learning_rate": 3.008849557522124e-05,
"loss": 0.8303,
"step": 418
},
{
"epoch": 1.8539823008849559,
"grad_norm": 0.0703125,
"learning_rate": 2.9203539823008852e-05,
"loss": 0.9533,
"step": 419
},
{
"epoch": 1.8584070796460177,
"grad_norm": 0.064453125,
"learning_rate": 2.831858407079646e-05,
"loss": 0.8822,
"step": 420
},
{
"epoch": 1.8628318584070795,
"grad_norm": 0.072265625,
"learning_rate": 2.743362831858407e-05,
"loss": 0.911,
"step": 421
},
{
"epoch": 1.8672566371681416,
"grad_norm": 0.060546875,
"learning_rate": 2.6548672566371686e-05,
"loss": 0.8209,
"step": 422
},
{
"epoch": 1.8716814159292037,
"grad_norm": 0.072265625,
"learning_rate": 2.5663716814159294e-05,
"loss": 0.8294,
"step": 423
},
{
"epoch": 1.8761061946902655,
"grad_norm": 0.061279296875,
"learning_rate": 2.4778761061946905e-05,
"loss": 0.7602,
"step": 424
},
{
"epoch": 1.8805309734513274,
"grad_norm": 0.0810546875,
"learning_rate": 2.3893805309734516e-05,
"loss": 0.8862,
"step": 425
},
{
"epoch": 1.8849557522123894,
"grad_norm": 0.06494140625,
"learning_rate": 2.3008849557522124e-05,
"loss": 0.8715,
"step": 426
},
{
"epoch": 1.8893805309734515,
"grad_norm": 0.06982421875,
"learning_rate": 2.2123893805309738e-05,
"loss": 0.9235,
"step": 427
},
{
"epoch": 1.893805309734513,
"grad_norm": 0.07958984375,
"learning_rate": 2.1238938053097346e-05,
"loss": 0.8975,
"step": 428
},
{
"epoch": 1.8982300884955752,
"grad_norm": 0.08935546875,
"learning_rate": 2.0353982300884957e-05,
"loss": 1.0014,
"step": 429
},
{
"epoch": 1.9026548672566372,
"grad_norm": 0.05712890625,
"learning_rate": 1.946902654867257e-05,
"loss": 0.8397,
"step": 430
},
{
"epoch": 1.907079646017699,
"grad_norm": 0.0859375,
"learning_rate": 1.858407079646018e-05,
"loss": 1.0832,
"step": 431
},
{
"epoch": 1.911504424778761,
"grad_norm": 0.09375,
"learning_rate": 1.7699115044247787e-05,
"loss": 0.7726,
"step": 432
},
{
"epoch": 1.915929203539823,
"grad_norm": 0.06884765625,
"learning_rate": 1.6814159292035402e-05,
"loss": 0.936,
"step": 433
},
{
"epoch": 1.920353982300885,
"grad_norm": 0.062255859375,
"learning_rate": 1.592920353982301e-05,
"loss": 1.0048,
"step": 434
},
{
"epoch": 1.924778761061947,
"grad_norm": 0.08349609375,
"learning_rate": 1.504424778761062e-05,
"loss": 0.864,
"step": 435
},
{
"epoch": 1.9292035398230087,
"grad_norm": 0.0869140625,
"learning_rate": 1.415929203539823e-05,
"loss": 0.9952,
"step": 436
},
{
"epoch": 1.9336283185840708,
"grad_norm": 0.06982421875,
"learning_rate": 1.3274336283185843e-05,
"loss": 0.8628,
"step": 437
},
{
"epoch": 1.9380530973451329,
"grad_norm": 0.060546875,
"learning_rate": 1.2389380530973452e-05,
"loss": 0.8487,
"step": 438
},
{
"epoch": 1.9424778761061947,
"grad_norm": 0.0634765625,
"learning_rate": 1.1504424778761062e-05,
"loss": 0.8495,
"step": 439
},
{
"epoch": 1.9469026548672566,
"grad_norm": 0.06689453125,
"learning_rate": 1.0619469026548673e-05,
"loss": 0.8815,
"step": 440
},
{
"epoch": 1.9513274336283186,
"grad_norm": 0.0634765625,
"learning_rate": 9.734513274336284e-06,
"loss": 0.8667,
"step": 441
},
{
"epoch": 1.9557522123893807,
"grad_norm": 0.0869140625,
"learning_rate": 8.849557522123894e-06,
"loss": 0.7515,
"step": 442
},
{
"epoch": 1.9601769911504425,
"grad_norm": 0.07275390625,
"learning_rate": 7.964601769911505e-06,
"loss": 0.8048,
"step": 443
},
{
"epoch": 1.9646017699115044,
"grad_norm": 0.0625,
"learning_rate": 7.079646017699115e-06,
"loss": 0.9373,
"step": 444
},
{
"epoch": 1.9690265486725664,
"grad_norm": 0.0859375,
"learning_rate": 6.194690265486726e-06,
"loss": 0.7985,
"step": 445
},
{
"epoch": 1.9734513274336283,
"grad_norm": 0.1083984375,
"learning_rate": 5.3097345132743365e-06,
"loss": 0.9149,
"step": 446
},
{
"epoch": 1.9778761061946901,
"grad_norm": 0.05615234375,
"learning_rate": 4.424778761061947e-06,
"loss": 0.8296,
"step": 447
},
{
"epoch": 1.9823008849557522,
"grad_norm": 0.061767578125,
"learning_rate": 3.5398230088495575e-06,
"loss": 0.8539,
"step": 448
},
{
"epoch": 1.9867256637168142,
"grad_norm": 0.0625,
"learning_rate": 2.6548672566371683e-06,
"loss": 0.8847,
"step": 449
},
{
"epoch": 1.991150442477876,
"grad_norm": 0.09033203125,
"learning_rate": 1.7699115044247788e-06,
"loss": 0.8814,
"step": 450
},
{
"epoch": 1.995575221238938,
"grad_norm": 0.0625,
"learning_rate": 8.849557522123894e-07,
"loss": 0.8299,
"step": 451
},
{
"epoch": 2.0,
"grad_norm": 0.12451171875,
"learning_rate": 0.0,
"loss": 0.8232,
"step": 452
}
],
"logging_steps": 1.0,
"max_steps": 452,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4086515032577802e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}