Update fine-tuned model

Browse files

Files changed (7) hide show

adapter_config.json +5 -5
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +405 -755
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "o_proj",
-    "up_proj",
-    "v_proj",
     "down_proj",
     "q_proj",
-    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "gate_proj",
     "down_proj",
+    "v_proj",
+    "up_proj",
+    "o_proj",
     "q_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1e39357b5e1933c0645027a46b91d0ecdc6f4bf8cd51738f54357c8987d67592
 size 159967880

 version https://git-lfs.github.com/spec/v1
+oid sha256:dc6a40ff3d37d8bf4cbd39b104eda057f3448f96f9e8547f016be36df6a3e524
 size 159967880

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:035fd60242408b013b171574b1e908c2df6aebaaeb341864628bf0b3695e99c2
 size 852876198

 version https://git-lfs.github.com/spec/v1
+oid sha256:657628d5d155d79b9aad8789e0951663eaa204fb58bf12b323c023fffe2b0085
 size 852876198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9fdc09048aeb5e786b623e473b823031e30bdd8fc2c3f0655e8e64ce6286d57
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:1dbbb3800d4095b7540d07b5bcccd341ea22380b31ae2d3484b7f5c78f026c73
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:194456d3c9e165255d5406a0f3f62973b0bede79d91784f72431350783e27ae7
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,1068 +1,718 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.0,
   "eval_steps": 500,
-  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.02,
-      "grad_norm": 0.5529273748397827,
-      "learning_rate": 2.5e-05,
-      "loss": 0.2724,
       "step": 1
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 0.5265244245529175,
-      "learning_rate": 5e-05,
-      "loss": 0.278,
       "step": 2
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 0.49891576170921326,
-      "learning_rate": 7.500000000000001e-05,
-      "loss": 0.2999,
       "step": 3
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 0.5106935501098633,
-      "learning_rate": 0.0001,
-      "loss": 0.2894,
       "step": 4
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 0.5380656719207764,
-      "learning_rate": 0.000125,
-      "loss": 0.2936,
       "step": 5
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 0.5809736847877502,
-      "learning_rate": 0.00015000000000000001,
-      "loss": 0.2693,
       "step": 6
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 0.6198976039886475,
-      "learning_rate": 0.000175,
-      "loss": 0.2862,
       "step": 7
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.5730974674224854,
-      "learning_rate": 0.0002,
-      "loss": 0.3018,
       "step": 8
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 0.5895105004310608,
-      "learning_rate": 0.00019997552766852432,
-      "loss": 0.2923,
       "step": 9
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.5411907434463501,
-      "learning_rate": 0.00019990212265199738,
-      "loss": 0.2853,
       "step": 10
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 0.5928137302398682,
-      "learning_rate": 0.00019977982087825713,
-      "loss": 0.3421,
       "step": 11
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.5884208083152771,
-      "learning_rate": 0.00019960868220749448,
-      "loss": 0.2963,
       "step": 12
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 0.5560010075569153,
-      "learning_rate": 0.00019938879040295508,
-      "loss": 0.3257,
       "step": 13
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 0.5500514507293701,
-      "learning_rate": 0.00019912025308994148,
-      "loss": 0.3122,
       "step": 14
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.5563028454780579,
-      "learning_rate": 0.0001988032017031364,
-      "loss": 0.3222,
       "step": 15
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 0.5426145792007446,
-      "learning_rate": 0.00019843779142227256,
-      "loss": 0.3383,
       "step": 16
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 0.5534917116165161,
-      "learning_rate": 0.0001980242010961803,
-      "loss": 0.3199,
       "step": 17
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.5668300986289978,
-      "learning_rate": 0.0001975626331552507,
-      "loss": 0.3568,
       "step": 18
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 0.5552094578742981,
-      "learning_rate": 0.00019705331351235674,
-      "loss": 0.3289,
       "step": 19
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.5791586637496948,
-      "learning_rate": 0.00019649649145228102,
-      "loss": 0.3168,
       "step": 20
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 0.5354134440422058,
-      "learning_rate": 0.00019589243950970402,
-      "loss": 0.332,
       "step": 21
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 0.5309009552001953,
-      "learning_rate": 0.00019524145333581317,
-      "loss": 0.3707,
       "step": 22
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 0.5601043105125427,
-      "learning_rate": 0.00019454385155359702,
-      "loss": 0.3234,
       "step": 23
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 0.5431475639343262,
-      "learning_rate": 0.00019379997560189675,
-      "loss": 0.2946,
       "step": 24
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.5860360860824585,
-      "learning_rate": 0.00019301018956828964,
-      "loss": 0.3075,
       "step": 25
     },
     {
-      "epoch": 0.52,
-      "grad_norm": 0.5602842569351196,
-      "learning_rate": 0.00019217488001088784,
-      "loss": 0.3358,
       "step": 26
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 0.5663277506828308,
-      "learning_rate": 0.00019129445576913888,
-      "loss": 0.3259,
       "step": 27
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 0.5709177851676941,
-      "learning_rate": 0.0001903693477637204,
-      "loss": 0.3508,
       "step": 28
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 0.5128621459007263,
-      "learning_rate": 0.00018940000878562758,
-      "loss": 0.3175,
       "step": 29
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.5354805588722229,
-      "learning_rate": 0.0001883869132745561,
-      "loss": 0.3318,
       "step": 30
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 0.5368937253952026,
-      "learning_rate": 0.00018733055708668926,
-      "loss": 0.3451,
       "step": 31
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 0.5688962340354919,
-      "learning_rate": 0.00018623145725200278,
-      "loss": 0.3467,
       "step": 32
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 0.5590227246284485,
-      "learning_rate": 0.00018509015172120621,
-      "loss": 0.3303,
       "step": 33
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 0.5596187710762024,
-      "learning_rate": 0.00018390719910244487,
-      "loss": 0.3384,
       "step": 34
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.5584146976470947,
-      "learning_rate": 0.00018268317838789088,
-      "loss": 0.3057,
       "step": 35
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 0.5675226449966431,
-      "learning_rate": 0.00018141868867035745,
-      "loss": 0.3581,
       "step": 36
     },
     {
-      "epoch": 0.74,
-      "grad_norm": 0.5121241807937622,
-      "learning_rate": 0.00018011434885007482,
-      "loss": 0.3716,
       "step": 37
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.540127694606781,
-      "learning_rate": 0.00017877079733177184,
-      "loss": 0.3458,
       "step": 38
     },
     {
-      "epoch": 0.78,
-      "grad_norm": 0.5629216432571411,
-      "learning_rate": 0.00017738869171221068,
-      "loss": 0.3457,
       "step": 39
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.5627008080482483,
-      "learning_rate": 0.0001759687084583285,
-      "loss": 0.3462,
       "step": 40
     },
     {
-      "epoch": 0.82,
-      "grad_norm": 0.5851187705993652,
-      "learning_rate": 0.00017451154257614287,
-      "loss": 0.3455,
       "step": 41
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.5467076301574707,
-      "learning_rate": 0.00017301790727058345,
-      "loss": 0.3289,
       "step": 42
     },
     {
-      "epoch": 0.86,
-      "grad_norm": 0.5140892267227173,
-      "learning_rate": 0.00017148853359641626,
-      "loss": 0.3478,
       "step": 43
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 0.5295486450195312,
-      "learning_rate": 0.00016992417010043142,
-      "loss": 0.351,
       "step": 44
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 0.5442476868629456,
-      "learning_rate": 0.00016832558245506935,
-      "loss": 0.3461,
       "step": 45
     },
     {
-      "epoch": 0.92,
-      "grad_norm": 0.5530596971511841,
-      "learning_rate": 0.0001666935530836651,
-      "loss": 0.3435,
       "step": 46
     },
     {
-      "epoch": 0.94,
-      "grad_norm": 0.5377740263938904,
-      "learning_rate": 0.0001650288807774937,
-      "loss": 0.3409,
       "step": 47
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.5187397003173828,
-      "learning_rate": 0.0001633323803048047,
-      "loss": 0.338,
       "step": 48
     },
     {
-      "epoch": 0.98,
-      "grad_norm": 0.5382808446884155,
-      "learning_rate": 0.00016160488201203644,
-      "loss": 0.3398,
       "step": 49
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.5559888482093811,
-      "learning_rate": 0.00015984723141740576,
-      "loss": 0.3366,
       "step": 50
     },
     {
-      "epoch": 1.02,
-      "grad_norm": 0.46435657143592834,
-      "learning_rate": 0.0001580602887970721,
-      "loss": 0.2239,
       "step": 51
     },
     {
-      "epoch": 1.04,
-      "grad_norm": 0.4675140678882599,
-      "learning_rate": 0.0001562449287640781,
-      "loss": 0.2007,
       "step": 52
     },
     {
-      "epoch": 1.06,
-      "grad_norm": 0.4767879247665405,
-      "learning_rate": 0.00015440203984027324,
-      "loss": 0.2223,
       "step": 53
     },
     {
-      "epoch": 1.08,
-      "grad_norm": 0.49781811237335205,
-      "learning_rate": 0.00015253252402142988,
-      "loss": 0.2262,
       "step": 54
     },
     {
-      "epoch": 1.1,
-      "grad_norm": 0.543898344039917,
-      "learning_rate": 0.0001506372963357644,
-      "loss": 0.19,
       "step": 55
     },
     {
-      "epoch": 1.12,
-      "grad_norm": 0.5789697170257568,
-      "learning_rate": 0.00014871728439607966,
-      "loss": 0.2011,
       "step": 56
     },
     {
-      "epoch": 1.1400000000000001,
-      "grad_norm": 0.5885554552078247,
-      "learning_rate": 0.00014677342794574817,
-      "loss": 0.2016,
       "step": 57
     },
     {
-      "epoch": 1.16,
-      "grad_norm": 0.5597030520439148,
-      "learning_rate": 0.00014480667839875786,
-      "loss": 0.1981,
       "step": 58
     },
     {
-      "epoch": 1.18,
-      "grad_norm": 0.5306347012519836,
-      "learning_rate": 0.00014281799837404552,
-      "loss": 0.1918,
       "step": 59
     },
     {
-      "epoch": 1.2,
-      "grad_norm": 0.5231032371520996,
-      "learning_rate": 0.0001408083612243465,
-      "loss": 0.1816,
       "step": 60
     },
     {
-      "epoch": 1.22,
-      "grad_norm": 0.4930267333984375,
-      "learning_rate": 0.00013877875055979023,
-      "loss": 0.1648,
       "step": 61
     },
     {
-      "epoch": 1.24,
-      "grad_norm": 0.5262163281440735,
-      "learning_rate": 0.00013673015976647568,
-      "loss": 0.2092,
       "step": 62
     },
     {
-      "epoch": 1.26,
-      "grad_norm": 0.5433812737464905,
-      "learning_rate": 0.00013466359152026195,
-      "loss": 0.2007,
       "step": 63
     },
     {
-      "epoch": 1.28,
-      "grad_norm": 0.47026363015174866,
-      "learning_rate": 0.00013258005729601177,
-      "loss": 0.1941,
       "step": 64
     },
     {
-      "epoch": 1.3,
-      "grad_norm": 0.4707397222518921,
-      "learning_rate": 0.00013048057687252865,
-      "loss": 0.2069,
       "step": 65
     },
     {
-      "epoch": 1.32,
-      "grad_norm": 0.48763400316238403,
-      "learning_rate": 0.0001283661778334297,
-      "loss": 0.1933,
       "step": 66
     },
     {
-      "epoch": 1.34,
-      "grad_norm": 0.4656035006046295,
-      "learning_rate": 0.0001262378950641979,
-      "loss": 0.1894,
       "step": 67
     },
     {
-      "epoch": 1.3599999999999999,
-      "grad_norm": 0.479379266500473,
-      "learning_rate": 0.00012409677024566144,
-      "loss": 0.2199,
       "step": 68
     },
     {
-      "epoch": 1.38,
-      "grad_norm": 0.5007523894309998,
-      "learning_rate": 0.00012194385134414608,
-      "loss": 0.2088,
       "step": 69
     },
     {
-      "epoch": 1.4,
-      "grad_norm": 0.48662200570106506,
-      "learning_rate": 0.00011978019209855174,
-      "loss": 0.2202,
       "step": 70
     },
     {
-      "epoch": 1.42,
-      "grad_norm": 0.46738380193710327,
-      "learning_rate": 0.00011760685150460362,
-      "loss": 0.1878,
       "step": 71
     },
     {
-      "epoch": 1.44,
-      "grad_norm": 0.4948503077030182,
-      "learning_rate": 0.00011542489329653024,
-      "loss": 0.1882,
       "step": 72
     },
     {
-      "epoch": 1.46,
-      "grad_norm": 0.4791596233844757,
-      "learning_rate": 0.00011323538542642227,
-      "loss": 0.1846,
       "step": 73
     },
     {
-      "epoch": 1.48,
-      "grad_norm": 0.4715379774570465,
-      "learning_rate": 0.000111039399541527,
-      "loss": 0.199,
       "step": 74
     },
     {
-      "epoch": 1.5,
-      "grad_norm": 0.4828183352947235,
-      "learning_rate": 0.00010883801045973425,
-      "loss": 0.2047,
       "step": 75
     },
     {
-      "epoch": 1.52,
-      "grad_norm": 0.48095616698265076,
-      "learning_rate": 0.00010663229564351041,
-      "loss": 0.218,
       "step": 76
     },
     {
-      "epoch": 1.54,
-      "grad_norm": 0.4789870083332062,
-      "learning_rate": 0.00010442333467253789,
-      "loss": 0.1924,
       "step": 77
     },
     {
-      "epoch": 1.56,
-      "grad_norm": 0.4771735966205597,
-      "learning_rate": 0.00010221220871531869,
-      "loss": 0.1864,
       "step": 78
     },
     {
-      "epoch": 1.58,
-      "grad_norm": 0.4639010727405548,
-      "learning_rate": 0.0001,
-      "loss": 0.1917,
       "step": 79
     },
     {
-      "epoch": 1.6,
-      "grad_norm": 0.49689823389053345,
-      "learning_rate": 9.778779128468132e-05,
-      "loss": 0.1769,
       "step": 80
     },
     {
-      "epoch": 1.62,
-      "grad_norm": 0.4767945408821106,
-      "learning_rate": 9.557666532746213e-05,
-      "loss": 0.1743,
       "step": 81
     },
     {
-      "epoch": 1.6400000000000001,
-      "grad_norm": 0.4821512699127197,
-      "learning_rate": 9.336770435648964e-05,
-      "loss": 0.1965,
       "step": 82
     },
     {
-      "epoch": 1.6600000000000001,
-      "grad_norm": 0.6080212593078613,
-      "learning_rate": 9.116198954026577e-05,
-      "loss": 0.1963,
       "step": 83
     },
     {
-      "epoch": 1.6800000000000002,
-      "grad_norm": 0.48027199506759644,
-      "learning_rate": 8.896060045847304e-05,
-      "loss": 0.2075,
       "step": 84
     },
     {
-      "epoch": 1.7,
-      "grad_norm": 0.5060182809829712,
-      "learning_rate": 8.676461457357776e-05,
-      "loss": 0.1882,
       "step": 85
     },
     {
-      "epoch": 1.72,
-      "grad_norm": 0.5219600796699524,
-      "learning_rate": 8.457510670346976e-05,
-      "loss": 0.2068,
       "step": 86
     },
     {
-      "epoch": 1.74,
-      "grad_norm": 0.47161611914634705,
-      "learning_rate": 8.239314849539638e-05,
-      "loss": 0.1924,
       "step": 87
     },
     {
-      "epoch": 1.76,
-      "grad_norm": 0.4543808400630951,
-      "learning_rate": 8.021980790144827e-05,
-      "loss": 0.1861,
       "step": 88
     },
     {
-      "epoch": 1.78,
-      "grad_norm": 0.4798787534236908,
-      "learning_rate": 7.805614865585396e-05,
-      "loss": 0.209,
       "step": 89
     },
     {
-      "epoch": 1.8,
-      "grad_norm": 0.4594615697860718,
-      "learning_rate": 7.590322975433857e-05,
-      "loss": 0.1721,
       "step": 90
     },
     {
-      "epoch": 1.8199999999999998,
-      "grad_norm": 0.49884089827537537,
-      "learning_rate": 7.376210493580212e-05,
-      "loss": 0.1964,
       "step": 91
     },
     {
-      "epoch": 1.8399999999999999,
-      "grad_norm": 0.4920552968978882,
-      "learning_rate": 7.163382216657034e-05,
-      "loss": 0.1944,
       "step": 92
     },
     {
-      "epoch": 1.8599999999999999,
-      "grad_norm": 0.4905566871166229,
-      "learning_rate": 6.951942312747134e-05,
-      "loss": 0.1876,
       "step": 93
     },
     {
-      "epoch": 1.88,
-      "grad_norm": 0.483819842338562,
-      "learning_rate": 6.741994270398826e-05,
-      "loss": 0.1825,
       "step": 94
     },
     {
-      "epoch": 1.9,
-      "grad_norm": 0.5140827894210815,
-      "learning_rate": 6.533640847973808e-05,
-      "loss": 0.1986,
       "step": 95
     },
     {
-      "epoch": 1.92,
-      "grad_norm": 0.49198630452156067,
-      "learning_rate": 6.326984023352435e-05,
-      "loss": 0.2113,
       "step": 96
     },
     {
-      "epoch": 1.94,
-      "grad_norm": 0.47631028294563293,
-      "learning_rate": 6.122124944020977e-05,
-      "loss": 0.1867,
       "step": 97
     },
     {
-      "epoch": 1.96,
-      "grad_norm": 0.47129324078559875,
-      "learning_rate": 5.91916387756535e-05,
-      "loss": 0.2016,
       "step": 98
     },
     {
-      "epoch": 1.98,
-      "grad_norm": 0.4497186839580536,
-      "learning_rate": 5.718200162595449e-05,
-      "loss": 0.1816,
       "step": 99
     },
     {
-      "epoch": 2.0,
-      "grad_norm": 0.4525175392627716,
-      "learning_rate": 5.5193321601242156e-05,
-      "loss": 0.1874,
-      "step": 100
-    },
-    {
-      "epoch": 2.02,
-      "grad_norm": 0.35321784019470215,
-      "learning_rate": 5.322657205425183e-05,
-      "loss": 0.1384,
-      "step": 101
-    },
-    {
-      "epoch": 2.04,
-      "grad_norm": 0.364757239818573,
-      "learning_rate": 5.1282715603920374e-05,
-      "loss": 0.1443,
-      "step": 102
-    },
-    {
-      "epoch": 2.06,
-      "grad_norm": 0.3620398938655853,
-      "learning_rate": 4.936270366423563e-05,
-      "loss": 0.1378,
-      "step": 103
-    },
-    {
-      "epoch": 2.08,
-      "grad_norm": 0.3393998444080353,
-      "learning_rate": 4.746747597857014e-05,
-      "loss": 0.1265,
-      "step": 104
-    },
-    {
-      "epoch": 2.1,
-      "grad_norm": 0.3330179750919342,
-      "learning_rate": 4.559796015972677e-05,
-      "loss": 0.12,
-      "step": 105
-    },
-    {
-      "epoch": 2.12,
-      "grad_norm": 0.3353124260902405,
-      "learning_rate": 4.375507123592194e-05,
-      "loss": 0.1203,
-      "step": 106
-    },
-    {
-      "epoch": 2.14,
-      "grad_norm": 0.36515313386917114,
-      "learning_rate": 4.1939711202927936e-05,
-      "loss": 0.114,
-      "step": 107
-    },
-    {
-      "epoch": 2.16,
-      "grad_norm": 0.3508182168006897,
-      "learning_rate": 4.015276858259427e-05,
-      "loss": 0.1231,
-      "step": 108
-    },
-    {
-      "epoch": 2.18,
-      "grad_norm": 0.37940266728401184,
-      "learning_rate": 3.839511798796357e-05,
-      "loss": 0.1326,
-      "step": 109
-    },
-    {
-      "epoch": 2.2,
-      "grad_norm": 0.35219907760620117,
-      "learning_rate": 3.6667619695195285e-05,
-      "loss": 0.1179,
-      "step": 110
-    },
-    {
-      "epoch": 2.22,
-      "grad_norm": 0.34503066539764404,
-      "learning_rate": 3.49711192225063e-05,
-      "loss": 0.121,
-      "step": 111
-    },
-    {
-      "epoch": 2.24,
-      "grad_norm": 0.36006444692611694,
-      "learning_rate": 3.330644691633492e-05,
-      "loss": 0.1167,
-      "step": 112
-    },
-    {
-      "epoch": 2.26,
-      "grad_norm": 0.412020742893219,
-      "learning_rate": 3.167441754493066e-05,
-      "loss": 0.1367,
-      "step": 113
-    },
-    {
-      "epoch": 2.2800000000000002,
-      "grad_norm": 0.45470723509788513,
-      "learning_rate": 3.0075829899568597e-05,
-      "loss": 0.1319,
-      "step": 114
-    },
-    {
-      "epoch": 2.3,
-      "grad_norm": 0.3515799045562744,
-      "learning_rate": 2.8511466403583766e-05,
-      "loss": 0.1185,
-      "step": 115
-    },
-    {
-      "epoch": 2.32,
-      "grad_norm": 0.37341800332069397,
-      "learning_rate": 2.6982092729416587e-05,
-      "loss": 0.1187,
-      "step": 116
-    },
-    {
-      "epoch": 2.34,
-      "grad_norm": 0.3499296307563782,
-      "learning_rate": 2.548845742385717e-05,
-      "loss": 0.1113,
-      "step": 117
-    },
-    {
-      "epoch": 2.36,
-      "grad_norm": 0.36934876441955566,
-      "learning_rate": 2.403129154167153e-05,
-      "loss": 0.115,
-      "step": 118
-    },
-    {
-      "epoch": 2.38,
-      "grad_norm": 0.3478608727455139,
-      "learning_rate": 2.2611308287789344e-05,
-      "loss": 0.1207,
-      "step": 119
-    },
-    {
-      "epoch": 2.4,
-      "grad_norm": 0.3456974923610687,
-      "learning_rate": 2.1229202668228197e-05,
-      "loss": 0.1139,
-      "step": 120
-    },
-    {
-      "epoch": 2.42,
-      "grad_norm": 0.3334108293056488,
-      "learning_rate": 1.988565114992519e-05,
-      "loss": 0.1296,
-      "step": 121
-    },
-    {
-      "epoch": 2.44,
-      "grad_norm": 0.348541259765625,
-      "learning_rate": 1.858131132964259e-05,
-      "loss": 0.1233,
-      "step": 122
-    },
-    {
-      "epoch": 2.46,
-      "grad_norm": 0.33513638377189636,
-      "learning_rate": 1.7316821612109136e-05,
-      "loss": 0.1269,
-      "step": 123
-    },
-    {
-      "epoch": 2.48,
-      "grad_norm": 0.35099372267723083,
-      "learning_rate": 1.609280089755515e-05,
-      "loss": 0.1132,
-      "step": 124
-    },
-    {
-      "epoch": 2.5,
-      "grad_norm": 0.34724029898643494,
-      "learning_rate": 1.4909848278793782e-05,
-      "loss": 0.1381,
-      "step": 125
-    },
-    {
-      "epoch": 2.52,
-      "grad_norm": 0.3333680033683777,
-      "learning_rate": 1.3768542747997215e-05,
-      "loss": 0.119,
-      "step": 126
-    },
-    {
-      "epoch": 2.54,
-      "grad_norm": 0.35580337047576904,
-      "learning_rate": 1.2669442913310725e-05,
-      "loss": 0.1196,
-      "step": 127
-    },
-    {
-      "epoch": 2.56,
-      "grad_norm": 0.35816505551338196,
-      "learning_rate": 1.161308672544389e-05,
-      "loss": 0.1259,
-      "step": 128
-    },
-    {
-      "epoch": 2.58,
-      "grad_norm": 0.35583576560020447,
-      "learning_rate": 1.059999121437244e-05,
-      "loss": 0.1396,
-      "step": 129
-    },
-    {
-      "epoch": 2.6,
-      "grad_norm": 0.37276527285575867,
-      "learning_rate": 9.630652236279625e-06,
-      "loss": 0.1369,
-      "step": 130
-    },
-    {
-      "epoch": 2.62,
-      "grad_norm": 0.34090283513069153,
-      "learning_rate": 8.70554423086114e-06,
-      "loss": 0.1114,
-      "step": 131
-    },
-    {
-      "epoch": 2.64,
-      "grad_norm": 0.32982465624809265,
-      "learning_rate": 7.825119989112173e-06,
-      "loss": 0.1232,
-      "step": 132
-    },
-    {
-      "epoch": 2.66,
-      "grad_norm": 0.36222097277641296,
-      "learning_rate": 6.989810431710375e-06,
-      "loss": 0.1193,
-      "step": 133
-    },
-    {
-      "epoch": 2.68,
-      "grad_norm": 0.33648136258125305,
-      "learning_rate": 6.200024398103255e-06,
-      "loss": 0.1046,
-      "step": 134
-    },
-    {
-      "epoch": 2.7,
-      "grad_norm": 0.34745728969573975,
-      "learning_rate": 5.456148446402976e-06,
-      "loss": 0.1312,
-      "step": 135
-    },
-    {
-      "epoch": 2.7199999999999998,
-      "grad_norm": 0.3730124235153198,
-      "learning_rate": 4.758546664186869e-06,
-      "loss": 0.1186,
-      "step": 136
-    },
-    {
-      "epoch": 2.74,
-      "grad_norm": 0.3411957919597626,
-      "learning_rate": 4.107560490295992e-06,
-      "loss": 0.1282,
-      "step": 137
-    },
-    {
-      "epoch": 2.76,
-      "grad_norm": 0.35611581802368164,
-      "learning_rate": 3.5035085477190143e-06,
-      "loss": 0.1286,
-      "step": 138
-    },
-    {
-      "epoch": 2.7800000000000002,
-      "grad_norm": 0.37135371565818787,
-      "learning_rate": 2.94668648764328e-06,
-      "loss": 0.1335,
-      "step": 139
-    },
-    {
-      "epoch": 2.8,
-      "grad_norm": 0.3284219205379486,
-      "learning_rate": 2.4373668447493224e-06,
-      "loss": 0.1089,
-      "step": 140
-    },
-    {
-      "epoch": 2.82,
-      "grad_norm": 0.36662524938583374,
-      "learning_rate": 1.9757989038197146e-06,
-      "loss": 0.1355,
-      "step": 141
-    },
-    {
-      "epoch": 2.84,
-      "grad_norm": 0.36059364676475525,
-      "learning_rate": 1.562208577727442e-06,
-      "loss": 0.1229,
-      "step": 142
-    },
-    {
-      "epoch": 2.86,
-      "grad_norm": 0.3409566283226013,
-      "learning_rate": 1.1967982968635993e-06,
-      "loss": 0.1183,
-      "step": 143
-    },
-    {
-      "epoch": 2.88,
-      "grad_norm": 0.36703115701675415,
-      "learning_rate": 8.797469100585431e-07,
-      "loss": 0.1416,
-      "step": 144
-    },
-    {
-      "epoch": 2.9,
-      "grad_norm": 0.34956759214401245,
-      "learning_rate": 6.11209597044926e-07,
-      "loss": 0.1223,
-      "step": 145
-    },
-    {
-      "epoch": 2.92,
-      "grad_norm": 0.3447030782699585,
-      "learning_rate": 3.913177925055189e-07,
-      "loss": 0.1283,
-      "step": 146
-    },
-    {
-      "epoch": 2.94,
-      "grad_norm": 0.35175061225891113,
-      "learning_rate": 2.201791217428917e-07,
-      "loss": 0.131,
-      "step": 147
-    },
-    {
-      "epoch": 2.96,
-      "grad_norm": 0.335248738527298,
-      "learning_rate": 9.78773480026396e-08,
-      "loss": 0.1141,
-      "step": 148
-    },
-    {
-      "epoch": 2.98,
-      "grad_norm": 0.329517126083374,
-      "learning_rate": 2.447233147570005e-08,
-      "loss": 0.1162,
-      "step": 149
-    },
-    {
-      "epoch": 3.0,
-      "grad_norm": 0.3240882158279419,
       "learning_rate": 0.0,
-      "loss": 0.1203,
-      "step": 150
     }
   ],
   "logging_steps": 1,
-  "max_steps": 150,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -1076,7 +726,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.954170222658683e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.01,
+      "grad_norm": 0.8039671182632446,
+      "learning_rate": 4e-05,
+      "loss": 0.3593,
       "step": 1
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 1.0575159788131714,
+      "learning_rate": 8e-05,
+      "loss": 0.4832,
       "step": 2
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 0.9031288027763367,
+      "learning_rate": 0.00012,
+      "loss": 0.4249,
       "step": 3
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 0.6510717868804932,
+      "learning_rate": 0.00016,
+      "loss": 0.4258,
       "step": 4
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 0.577819287776947,
+      "learning_rate": 0.0002,
+      "loss": 0.3766,
       "step": 5
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 0.5394201874732971,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 0.3585,
       "step": 6
     },
     {
+      "epoch": 0.07,
+      "grad_norm": 0.4903915524482727,
+      "learning_rate": 0.00019978136272187747,
+      "loss": 0.3804,
       "step": 7
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 0.48727235198020935,
+      "learning_rate": 0.00019950829025450114,
+      "loss": 0.4129,
       "step": 8
     },
     {
+      "epoch": 0.09,
+      "grad_norm": 0.5153201818466187,
+      "learning_rate": 0.00019912640693269752,
+      "loss": 0.4332,
       "step": 9
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 0.502315104007721,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 0.4077,
       "step": 10
     },
     {
+      "epoch": 0.11,
+      "grad_norm": 0.472267746925354,
+      "learning_rate": 0.00019803799658748094,
+      "loss": 0.4006,
       "step": 11
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.5331538319587708,
+      "learning_rate": 0.0001973326597248006,
+      "loss": 0.4175,
       "step": 12
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 0.5011341571807861,
+      "learning_rate": 0.00019652089102773488,
+      "loss": 0.4139,
       "step": 13
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 0.5202248096466064,
+      "learning_rate": 0.00019560357815343577,
+      "loss": 0.3731,
       "step": 14
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 0.5288619995117188,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 0.3912,
       "step": 15
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.5330358743667603,
+      "learning_rate": 0.0001934564464599461,
+      "loss": 0.4218,
       "step": 16
     },
     {
+      "epoch": 0.17,
+      "grad_norm": 0.528815507888794,
+      "learning_rate": 0.00019222897549773848,
+      "loss": 0.4527,
       "step": 17
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 0.5266752243041992,
+      "learning_rate": 0.00019090065350491626,
+      "loss": 0.3762,
       "step": 18
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.48899364471435547,
+      "learning_rate": 0.00018947293298207635,
+      "loss": 0.3838,
       "step": 19
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.4758334159851074,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 0.3798,
       "step": 20
     },
     {
+      "epoch": 0.21,
+      "grad_norm": 0.5223532319068909,
+      "learning_rate": 0.00018632564809575742,
+      "loss": 0.364,
       "step": 21
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 0.5233363509178162,
+      "learning_rate": 0.00018460952524209355,
+      "loss": 0.373,
       "step": 22
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 0.4868537485599518,
+      "learning_rate": 0.00018280088311480201,
+      "loss": 0.4269,
       "step": 23
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.5180346965789795,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.4044,
       "step": 24
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.5078471899032593,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 0.353,
       "step": 25
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 0.4828091859817505,
+      "learning_rate": 0.00017684011108568592,
+      "loss": 0.373,
       "step": 26
     },
     {
+      "epoch": 0.27,
+      "grad_norm": 0.49357226490974426,
+      "learning_rate": 0.0001746821476984154,
+      "loss": 0.4001,
       "step": 27
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.5360887050628662,
+      "learning_rate": 0.00017244252047910892,
+      "loss": 0.424,
       "step": 28
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.5232270359992981,
+      "learning_rate": 0.00017012367842724887,
+      "loss": 0.3986,
       "step": 29
     },
     {
+      "epoch": 0.3,
+      "grad_norm": 0.5093458294868469,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 0.3909,
       "step": 30
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 0.49155759811401367,
+      "learning_rate": 0.00016525857615241687,
+      "loss": 0.409,
       "step": 31
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.47392013669013977,
+      "learning_rate": 0.0001627176358473537,
+      "loss": 0.3658,
       "step": 32
     },
     {
+      "epoch": 0.33,
+      "grad_norm": 0.512052595615387,
+      "learning_rate": 0.00016010811472830252,
+      "loss": 0.3979,
       "step": 33
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 0.49206939339637756,
+      "learning_rate": 0.00015743286626829437,
+      "loss": 0.3871,
       "step": 34
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 0.5002603530883789,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 0.3668,
       "step": 35
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.5139701962471008,
+      "learning_rate": 0.00015189695737812152,
+      "loss": 0.4014,
       "step": 36
     },
     {
+      "epoch": 0.37,
+      "grad_norm": 0.5197802186012268,
+      "learning_rate": 0.00014904235038305083,
+      "loss": 0.3951,
       "step": 37
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.5155587196350098,
+      "learning_rate": 0.0001461341162978688,
+      "loss": 0.4161,
       "step": 38
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 0.49465492367744446,
+      "learning_rate": 0.00014317543523384928,
+      "loss": 0.3898,
       "step": 39
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.4801078140735626,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 0.3973,
       "step": 40
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 0.48596322536468506,
+      "learning_rate": 0.00013711972489182208,
+      "loss": 0.4097,
       "step": 41
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 0.5131967663764954,
+      "learning_rate": 0.00013402931744416433,
+      "loss": 0.3829,
       "step": 42
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 0.4835667610168457,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.3642,
       "step": 43
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.5013434886932373,
+      "learning_rate": 0.00012774029087618446,
+      "loss": 0.3675,
       "step": 44
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 0.4737671911716461,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 0.4346,
       "step": 45
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 0.544231116771698,
+      "learning_rate": 0.0001213299630743747,
+      "loss": 0.3914,
       "step": 46
     },
     {
+      "epoch": 0.47,
+      "grad_norm": 0.5040849447250366,
+      "learning_rate": 0.000118088053433211,
+      "loss": 0.3744,
       "step": 47
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.5225382447242737,
+      "learning_rate": 0.0001148263647711842,
+      "loss": 0.3922,
       "step": 48
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 0.4771358370780945,
+      "learning_rate": 0.00011154846369695863,
+      "loss": 0.3552,
       "step": 49
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 0.4580378532409668,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 0.3667,
       "step": 50
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 0.5024124979972839,
+      "learning_rate": 0.00010495837546732224,
+      "loss": 0.3469,
       "step": 51
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.4725678861141205,
+      "learning_rate": 0.00010165339447663587,
+      "loss": 0.3717,
       "step": 52
     },
     {
+      "epoch": 0.53,
+      "grad_norm": 0.47523003816604614,
+      "learning_rate": 9.834660552336415e-05,
+      "loss": 0.3779,
       "step": 53
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 0.5019033551216125,
+      "learning_rate": 9.504162453267777e-05,
+      "loss": 0.3548,
       "step": 54
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 0.5027766227722168,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 0.4236,
       "step": 55
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.512511670589447,
+      "learning_rate": 8.845153630304139e-05,
+      "loss": 0.3492,
       "step": 56
     },
     {
+      "epoch": 0.57,
+      "grad_norm": 0.4811123311519623,
+      "learning_rate": 8.517363522881579e-05,
+      "loss": 0.3627,
       "step": 57
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 0.5243905782699585,
+      "learning_rate": 8.191194656678904e-05,
+      "loss": 0.4194,
       "step": 58
     },
     {
+      "epoch": 0.59,
+      "grad_norm": 0.4740852117538452,
+      "learning_rate": 7.867003692562534e-05,
+      "loss": 0.3481,
       "step": 59
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.4817480146884918,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 0.3156,
       "step": 60
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.5003472566604614,
+      "learning_rate": 7.225970912381556e-05,
+      "loss": 0.3746,
       "step": 61
     },
     {
+      "epoch": 0.62,
+      "grad_norm": 0.4828045070171356,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.3616,
       "step": 62
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 0.4666941463947296,
+      "learning_rate": 6.59706825558357e-05,
+      "loss": 0.3411,
       "step": 63
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.5042151808738708,
+      "learning_rate": 6.28802751081779e-05,
+      "loss": 0.3658,
       "step": 64
     },
     {
+      "epoch": 0.65,
+      "grad_norm": 0.49939414858818054,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 0.3993,
       "step": 65
     },
     {
+      "epoch": 0.66,
+      "grad_norm": 0.48640677332878113,
+      "learning_rate": 5.6824564766150726e-05,
+      "loss": 0.3295,
       "step": 66
     },
     {
+      "epoch": 0.67,
+      "grad_norm": 0.4997316300868988,
+      "learning_rate": 5.386588370213124e-05,
+      "loss": 0.3571,
       "step": 67
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.508797287940979,
+      "learning_rate": 5.095764961694922e-05,
+      "loss": 0.3626,
       "step": 68
     },
     {
+      "epoch": 0.69,
+      "grad_norm": 0.45878127217292786,
+      "learning_rate": 4.810304262187852e-05,
+      "loss": 0.3726,
       "step": 69
     },
     {
+      "epoch": 0.7,
+      "grad_norm": 0.49244609475135803,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 0.3577,
       "step": 70
     },
     {
+      "epoch": 0.71,
+      "grad_norm": 0.46602892875671387,
+      "learning_rate": 4.256713373170564e-05,
+      "loss": 0.3403,
       "step": 71
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 0.502491295337677,
+      "learning_rate": 3.9891885271697496e-05,
+      "loss": 0.3662,
       "step": 72
     },
     {
+      "epoch": 0.73,
+      "grad_norm": 0.47285720705986023,
+      "learning_rate": 3.7282364152646297e-05,
+      "loss": 0.3187,
       "step": 73
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 0.4815748333930969,
+      "learning_rate": 3.4741423847583134e-05,
+      "loss": 0.3742,
       "step": 74
     },
     {
+      "epoch": 0.75,
+      "grad_norm": 0.5235660672187805,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 0.3809,
       "step": 75
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 0.46197509765625,
+      "learning_rate": 2.9876321572751144e-05,
+      "loss": 0.3298,
       "step": 76
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 0.455169141292572,
+      "learning_rate": 2.7557479520891104e-05,
+      "loss": 0.3544,
       "step": 77
     },
     {
+      "epoch": 0.78,
+      "grad_norm": 0.4787601828575134,
+      "learning_rate": 2.5317852301584643e-05,
+      "loss": 0.3466,
       "step": 78
     },
     {
+      "epoch": 0.79,
+      "grad_norm": 0.47747695446014404,
+      "learning_rate": 2.315988891431412e-05,
+      "loss": 0.3189,
       "step": 79
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 0.475917786359787,
+      "learning_rate": 2.1085949060360654e-05,
+      "loss": 0.3832,
       "step": 80
     },
     {
+      "epoch": 0.81,
+      "grad_norm": 0.4622023105621338,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.3414,
       "step": 81
     },
     {
+      "epoch": 0.82,
+      "grad_norm": 0.48533836007118225,
+      "learning_rate": 1.7199116885197995e-05,
+      "loss": 0.3587,
       "step": 82
     },
     {
+      "epoch": 0.83,
+      "grad_norm": 0.45964503288269043,
+      "learning_rate": 1.5390474757906446e-05,
+      "loss": 0.3244,
       "step": 83
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.5024991631507874,
+      "learning_rate": 1.3674351904242611e-05,
+      "loss": 0.4067,
       "step": 84
     },
     {
+      "epoch": 0.85,
+      "grad_norm": 0.4903584420681,
+      "learning_rate": 1.2052624879351104e-05,
+      "loss": 0.3641,
       "step": 85
     },
     {
+      "epoch": 0.86,
+      "grad_norm": 0.44721242785453796,
+      "learning_rate": 1.0527067017923654e-05,
+      "loss": 0.3121,
       "step": 86
     },
     {
+      "epoch": 0.87,
+      "grad_norm": 0.48709988594055176,
+      "learning_rate": 9.09934649508375e-06,
+      "loss": 0.3692,
       "step": 87
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 0.4607761800289154,
+      "learning_rate": 7.771024502261526e-06,
+      "loss": 0.3523,
       "step": 88
     },
     {
+      "epoch": 0.89,
+      "grad_norm": 0.4875771999359131,
+      "learning_rate": 6.543553540053926e-06,
+      "loss": 0.3445,
       "step": 89
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 0.4596504271030426,
+      "learning_rate": 5.418275829936537e-06,
+      "loss": 0.3349,
       "step": 90
     },
     {
+      "epoch": 0.91,
+      "grad_norm": 0.46433568000793457,
+      "learning_rate": 4.3964218465642355e-06,
+      "loss": 0.323,
       "step": 91
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 0.4667503833770752,
+      "learning_rate": 3.4791089722651436e-06,
+      "loss": 0.3228,
       "step": 92
     },
     {
+      "epoch": 0.93,
+      "grad_norm": 0.490509033203125,
+      "learning_rate": 2.667340275199426e-06,
+      "loss": 0.3673,
       "step": 93
     },
     {
+      "epoch": 0.94,
+      "grad_norm": 0.4769146144390106,
+      "learning_rate": 1.9620034125190644e-06,
+      "loss": 0.3211,
       "step": 94
     },
     {
+      "epoch": 0.95,
+      "grad_norm": 0.5546551942825317,
+      "learning_rate": 1.3638696597277679e-06,
+      "loss": 0.3613,
       "step": 95
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 0.47154128551483154,
+      "learning_rate": 8.735930673024806e-07,
+      "loss": 0.35,
       "step": 96
     },
     {
+      "epoch": 0.97,
+      "grad_norm": 0.48646360635757446,
+      "learning_rate": 4.917097454988584e-07,
+      "loss": 0.349,
       "step": 97
     },
     {
+      "epoch": 0.98,
+      "grad_norm": 0.48640263080596924,
+      "learning_rate": 2.1863727812254653e-07,
+      "loss": 0.3713,
       "step": 98
     },
     {
+      "epoch": 0.99,
+      "grad_norm": 0.47486788034439087,
+      "learning_rate": 5.467426590739511e-08,
+      "loss": 0.296,
       "step": 99
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 0.4658459424972534,
       "learning_rate": 0.0,
+      "loss": 0.3369,
+      "step": 100
     }
   ],
   "logging_steps": 1,
+  "max_steps": 100,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 1.3312303903280333e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f50a1f8500c6886b194f3cb1f9dec14b859ae7d726b38257900cde0c2d2f4eef
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:84840f0c9d05975fb66dbe0df3b9fc3f7f2326fff03a93716d61f75cc3024fc7
 size 5240