{ "best_metric": null, "best_model_checkpoint": null, "epoch": 97.44590163934426, "eval_steps": 500, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.2885245901639344, "grad_norm": 49.25, "learning_rate": 0.00019747235387045816, "loss": 6.9218, "step": 50 }, { "epoch": 2.577049180327869, "grad_norm": 73.5, "learning_rate": 0.0001948393891521854, "loss": 3.5446, "step": 100 }, { "epoch": 3.865573770491803, "grad_norm": 58.25, "learning_rate": 0.0001922064244339126, "loss": 3.191, "step": 150 }, { "epoch": 5.131147540983607, "grad_norm": 49.0, "learning_rate": 0.00018957345971563983, "loss": 2.9104, "step": 200 }, { "epoch": 6.419672131147541, "grad_norm": 66.5, "learning_rate": 0.00018694049499736707, "loss": 2.0795, "step": 250 }, { "epoch": 7.7081967213114755, "grad_norm": 45.75, "learning_rate": 0.00018430753027909427, "loss": 2.3055, "step": 300 }, { "epoch": 8.99672131147541, "grad_norm": 56.25, "learning_rate": 0.0001816745655608215, "loss": 1.8394, "step": 350 }, { "epoch": 10.262295081967213, "grad_norm": 50.75, "learning_rate": 0.00017904160084254874, "loss": 1.5723, "step": 400 }, { "epoch": 11.550819672131148, "grad_norm": 48.5, "learning_rate": 0.00017640863612427594, "loss": 1.4006, "step": 450 }, { "epoch": 12.839344262295082, "grad_norm": 39.75, "learning_rate": 0.00017377567140600318, "loss": 1.363, "step": 500 }, { "epoch": 14.104918032786886, "grad_norm": 38.5, "learning_rate": 0.0001711427066877304, "loss": 1.3352, "step": 550 }, { "epoch": 15.39344262295082, "grad_norm": 45.0, "learning_rate": 0.00016850974196945762, "loss": 1.1165, "step": 600 }, { "epoch": 16.681967213114753, "grad_norm": 44.0, "learning_rate": 0.00016587677725118485, "loss": 0.8736, "step": 650 }, { "epoch": 17.970491803278687, "grad_norm": 43.5, "learning_rate": 0.00016324381253291208, "loss": 1.0635, "step": 700 }, { "epoch": 19.236065573770492, "grad_norm": 34.25, "learning_rate": 0.0001606108478146393, "loss": 0.7858, "step": 750 }, { "epoch": 20.524590163934427, "grad_norm": 37.25, "learning_rate": 0.00015797788309636652, "loss": 0.8236, "step": 800 }, { "epoch": 21.81311475409836, "grad_norm": 35.5, "learning_rate": 0.00015534491837809376, "loss": 0.7766, "step": 850 }, { "epoch": 23.078688524590163, "grad_norm": 33.0, "learning_rate": 0.00015271195365982096, "loss": 0.6612, "step": 900 }, { "epoch": 24.367213114754097, "grad_norm": 33.75, "learning_rate": 0.0001500789889415482, "loss": 0.6364, "step": 950 }, { "epoch": 25.65573770491803, "grad_norm": 38.25, "learning_rate": 0.00014744602422327543, "loss": 0.6553, "step": 1000 }, { "epoch": 26.944262295081966, "grad_norm": 29.25, "learning_rate": 0.00014481305950500263, "loss": 0.5468, "step": 1050 }, { "epoch": 28.20983606557377, "grad_norm": 35.25, "learning_rate": 0.00014218009478672987, "loss": 0.5311, "step": 1100 }, { "epoch": 29.498360655737706, "grad_norm": 27.75, "learning_rate": 0.0001395471300684571, "loss": 0.5019, "step": 1150 }, { "epoch": 30.78688524590164, "grad_norm": 28.125, "learning_rate": 0.0001369141653501843, "loss": 0.6387, "step": 1200 }, { "epoch": 32.05245901639344, "grad_norm": 38.75, "learning_rate": 0.00013428120063191154, "loss": 0.5054, "step": 1250 }, { "epoch": 33.34098360655738, "grad_norm": 21.875, "learning_rate": 0.00013164823591363877, "loss": 0.4805, "step": 1300 }, { "epoch": 34.62950819672131, "grad_norm": 29.375, "learning_rate": 0.00012901527119536598, "loss": 0.5118, "step": 1350 }, { "epoch": 
35.91803278688525, "grad_norm": 36.0, "learning_rate": 0.0001263823064770932, "loss": 0.447, "step": 1400 }, { "epoch": 37.18360655737705, "grad_norm": 24.125, "learning_rate": 0.00012374934175882045, "loss": 0.3921, "step": 1450 }, { "epoch": 38.472131147540985, "grad_norm": 21.875, "learning_rate": 0.00012111637704054765, "loss": 0.4268, "step": 1500 }, { "epoch": 39.760655737704916, "grad_norm": 22.25, "learning_rate": 0.00011848341232227489, "loss": 0.3317, "step": 1550 }, { "epoch": 41.02622950819672, "grad_norm": 15.0625, "learning_rate": 0.00011585044760400212, "loss": 0.387, "step": 1600 }, { "epoch": 42.31475409836066, "grad_norm": 20.875, "learning_rate": 0.00011321748288572934, "loss": 0.3285, "step": 1650 }, { "epoch": 43.60327868852459, "grad_norm": 21.375, "learning_rate": 0.00011058451816745656, "loss": 0.3281, "step": 1700 }, { "epoch": 44.89180327868853, "grad_norm": 22.75, "learning_rate": 0.00010795155344918379, "loss": 0.3148, "step": 1750 }, { "epoch": 46.157377049180326, "grad_norm": 18.75, "learning_rate": 0.00010531858873091101, "loss": 0.2567, "step": 1800 }, { "epoch": 47.445901639344264, "grad_norm": 23.75, "learning_rate": 0.00010268562401263824, "loss": 0.2609, "step": 1850 }, { "epoch": 48.734426229508195, "grad_norm": 18.75, "learning_rate": 0.00010005265929436546, "loss": 0.2365, "step": 1900 }, { "epoch": 50.0, "grad_norm": 5.09375, "learning_rate": 9.74196945760927e-05, "loss": 0.2555, "step": 1950 }, { "epoch": 51.28852459016394, "grad_norm": 11.9375, "learning_rate": 9.478672985781992e-05, "loss": 0.2184, "step": 2000 }, { "epoch": 52.57704918032787, "grad_norm": 12.8125, "learning_rate": 9.215376513954714e-05, "loss": 0.2279, "step": 2050 }, { "epoch": 53.86557377049181, "grad_norm": 13.8125, "learning_rate": 8.952080042127437e-05, "loss": 0.202, "step": 2100 }, { "epoch": 55.131147540983605, "grad_norm": 13.0625, "learning_rate": 8.688783570300159e-05, "loss": 0.1651, "step": 2150 }, { "epoch": 56.41967213114754, "grad_norm": 11.4375, "learning_rate": 8.425487098472881e-05, "loss": 0.2015, "step": 2200 }, { "epoch": 57.708196721311474, "grad_norm": 16.375, "learning_rate": 8.162190626645604e-05, "loss": 0.1504, "step": 2250 }, { "epoch": 58.99672131147541, "grad_norm": 13.0625, "learning_rate": 7.898894154818326e-05, "loss": 0.1725, "step": 2300 }, { "epoch": 60.26229508196721, "grad_norm": 13.6875, "learning_rate": 7.635597682991048e-05, "loss": 0.1499, "step": 2350 }, { "epoch": 61.55081967213115, "grad_norm": 7.59375, "learning_rate": 7.372301211163771e-05, "loss": 0.145, "step": 2400 }, { "epoch": 62.83934426229508, "grad_norm": 7.0625, "learning_rate": 7.109004739336493e-05, "loss": 0.1379, "step": 2450 }, { "epoch": 64.10491803278688, "grad_norm": 4.15625, "learning_rate": 6.845708267509215e-05, "loss": 0.1244, "step": 2500 }, { "epoch": 65.39344262295081, "grad_norm": 7.0, "learning_rate": 6.582411795681939e-05, "loss": 0.1214, "step": 2550 }, { "epoch": 66.68196721311476, "grad_norm": 9.3125, "learning_rate": 6.31911532385466e-05, "loss": 0.1341, "step": 2600 }, { "epoch": 67.97049180327869, "grad_norm": 7.09375, "learning_rate": 6.0558188520273826e-05, "loss": 0.1201, "step": 2650 }, { "epoch": 69.23606557377049, "grad_norm": 20.5, "learning_rate": 5.792522380200106e-05, "loss": 0.1049, "step": 2700 }, { "epoch": 70.52459016393442, "grad_norm": 5.90625, "learning_rate": 5.529225908372828e-05, "loss": 0.1033, "step": 2750 }, { "epoch": 71.81311475409836, "grad_norm": 3.25, "learning_rate": 5.2659294365455505e-05, "loss": 0.1028, "step": 2800 }, 
{ "epoch": 73.07868852459016, "grad_norm": 8.0625, "learning_rate": 5.002632964718273e-05, "loss": 0.1003, "step": 2850 }, { "epoch": 74.3672131147541, "grad_norm": 6.9375, "learning_rate": 4.739336492890996e-05, "loss": 0.0993, "step": 2900 }, { "epoch": 75.65573770491804, "grad_norm": 3.734375, "learning_rate": 4.4760400210637185e-05, "loss": 0.0988, "step": 2950 }, { "epoch": 76.94426229508197, "grad_norm": 4.84375, "learning_rate": 4.2127435492364404e-05, "loss": 0.0885, "step": 3000 }, { "epoch": 78.20983606557377, "grad_norm": 2.71875, "learning_rate": 3.949447077409163e-05, "loss": 0.0816, "step": 3050 }, { "epoch": 79.4983606557377, "grad_norm": 1.59375, "learning_rate": 3.686150605581886e-05, "loss": 0.0969, "step": 3100 }, { "epoch": 80.78688524590164, "grad_norm": 2.5625, "learning_rate": 3.422854133754608e-05, "loss": 0.0886, "step": 3150 }, { "epoch": 82.05245901639344, "grad_norm": 4.6875, "learning_rate": 3.15955766192733e-05, "loss": 0.0801, "step": 3200 }, { "epoch": 83.34098360655737, "grad_norm": 2.53125, "learning_rate": 2.896261190100053e-05, "loss": 0.0888, "step": 3250 }, { "epoch": 84.62950819672132, "grad_norm": 4.1875, "learning_rate": 2.6329647182727753e-05, "loss": 0.0872, "step": 3300 }, { "epoch": 85.91803278688525, "grad_norm": 2.9375, "learning_rate": 2.369668246445498e-05, "loss": 0.0807, "step": 3350 }, { "epoch": 87.18360655737705, "grad_norm": 2.84375, "learning_rate": 2.1063717746182202e-05, "loss": 0.0779, "step": 3400 }, { "epoch": 88.47213114754098, "grad_norm": 1.8125, "learning_rate": 1.843075302790943e-05, "loss": 0.0741, "step": 3450 }, { "epoch": 89.76065573770492, "grad_norm": 2.171875, "learning_rate": 1.579778830963665e-05, "loss": 0.0833, "step": 3500 }, { "epoch": 91.02622950819672, "grad_norm": 2.484375, "learning_rate": 1.3164823591363876e-05, "loss": 0.0861, "step": 3550 }, { "epoch": 92.31475409836065, "grad_norm": 2.046875, "learning_rate": 1.0531858873091101e-05, "loss": 0.08, "step": 3600 }, { "epoch": 93.6032786885246, "grad_norm": 2.84375, "learning_rate": 7.898894154818326e-06, "loss": 0.0785, "step": 3650 }, { "epoch": 94.89180327868853, "grad_norm": 2.28125, "learning_rate": 5.2659294365455505e-06, "loss": 0.0936, "step": 3700 }, { "epoch": 96.15737704918033, "grad_norm": 1.7734375, "learning_rate": 2.6329647182727753e-06, "loss": 0.0741, "step": 3750 }, { "epoch": 97.44590163934426, "grad_norm": 2.21875, "learning_rate": 0.0, "loss": 0.0927, "step": 3800 } ], "logging_steps": 50, "max_steps": 3800, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.470967617037125e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }