Update fine-tuned model

Browse files

Files changed (7) hide show

adapter_config.json +4 -4
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +410 -165
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "gate_proj",
     "q_proj",
-    "down_proj",
     "v_proj",
     "up_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
+    "o_proj",
+    "k_proj",
     "v_proj",
     "up_proj",
+    "down_proj",
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b4da9de7e6a0c855646daa4e8692b61bf593a9e5db24ad71f56b011fd878c8a3
 size 159967880

 version https://git-lfs.github.com/spec/v1
+oid sha256:ae7d6124db2999b66ba36817f8c2d2311a0f4f6fb36106ec14c0bc5b31769573
 size 159967880

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a93b54fda0e2952d60fd2423d8d212c30612413fbad1598c7e439365d4fa8fc
 size 852876198

 version https://git-lfs.github.com/spec/v1
+oid sha256:d60e76bf5ab8ae5b9e9260b0b8905e5be77afb63ceeab61da8f715e051ae9ec5
 size 852876198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:042eeaadaa77e55313f6c5e71c307c518f1290d990d00304e40386bd32b1d3e0
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:841c3d2e9b5e46e8a77c6c9e705dba80a96ae5b9084634adc158035e3d78011a
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d01151db1fc4f9c05131abecdc90435e3aab7eb2c3021fc926311286e779587
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:babd43118aa66a4f5266d730539cf7f09611158b169d9e63dbcb83f6bbaa8626
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,298 +1,543 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 8.016032064128256,
   "eval_steps": 500,
-  "global_step": 40,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.20040080160320642,
-      "grad_norm": 3.027674674987793,
-      "learning_rate": 0.0001,
-      "loss": 1.9225,
       "step": 1
     },
     {
-      "epoch": 0.40080160320641284,
-      "grad_norm": 3.0814154148101807,
-      "learning_rate": 0.0002,
-      "loss": 1.937,
       "step": 2
     },
     {
-      "epoch": 0.6012024048096193,
-      "grad_norm": 2.178030252456665,
-      "learning_rate": 0.000199658449300667,
-      "loss": 1.585,
       "step": 3
     },
     {
-      "epoch": 0.8016032064128257,
-      "grad_norm": 1.4929898977279663,
-      "learning_rate": 0.00019863613034027224,
-      "loss": 1.2414,
       "step": 4
     },
     {
-      "epoch": 1.002004008016032,
-      "grad_norm": 1.2824300527572632,
-      "learning_rate": 0.00019694002659393305,
-      "loss": 1.109,
       "step": 5
     },
     {
-      "epoch": 1.2024048096192386,
-      "grad_norm": 0.8394728899002075,
-      "learning_rate": 0.00019458172417006347,
-      "loss": 0.7618,
       "step": 6
     },
     {
-      "epoch": 1.402805611222445,
-      "grad_norm": 0.7309438586235046,
-      "learning_rate": 0.00019157733266550575,
-      "loss": 0.6732,
       "step": 7
     },
     {
-      "epoch": 1.6032064128256514,
-      "grad_norm": 0.69007807970047,
-      "learning_rate": 0.0001879473751206489,
-      "loss": 0.646,
       "step": 8
     },
     {
-      "epoch": 1.8036072144288577,
-      "grad_norm": 0.6368725299835205,
-      "learning_rate": 0.00018371664782625287,
-      "loss": 0.6147,
       "step": 9
     },
     {
-      "epoch": 2.004008016032064,
-      "grad_norm": 0.5702280402183533,
-      "learning_rate": 0.00017891405093963938,
-      "loss": 0.5384,
       "step": 10
     },
     {
-      "epoch": 2.2044088176352705,
-      "grad_norm": 0.5861708521842957,
-      "learning_rate": 0.00017357239106731317,
-      "loss": 0.4272,
       "step": 11
     },
     {
-      "epoch": 2.404809619238477,
-      "grad_norm": 0.537497341632843,
-      "learning_rate": 0.00016772815716257412,
-      "loss": 0.419,
       "step": 12
     },
     {
-      "epoch": 2.6052104208416833,
-      "grad_norm": 0.4901179373264313,
-      "learning_rate": 0.0001614212712689668,
-      "loss": 0.3951,
       "step": 13
     },
     {
-      "epoch": 2.80561122244489,
-      "grad_norm": 0.44876372814178467,
-      "learning_rate": 0.00015469481581224272,
-      "loss": 0.3362,
       "step": 14
     },
     {
-      "epoch": 3.006012024048096,
-      "grad_norm": 0.4591052234172821,
-      "learning_rate": 0.00014759473930370736,
-      "loss": 0.321,
       "step": 15
     },
     {
-      "epoch": 3.2064128256513027,
-      "grad_norm": 0.3763630986213684,
-      "learning_rate": 0.00014016954246529696,
-      "loss": 0.2673,
       "step": 16
     },
     {
-      "epoch": 3.406813627254509,
-      "grad_norm": 0.37145256996154785,
-      "learning_rate": 0.00013246994692046836,
-      "loss": 0.2554,
       "step": 17
     },
     {
-      "epoch": 3.6072144288577155,
-      "grad_norm": 0.3408704102039337,
-      "learning_rate": 0.00012454854871407994,
-      "loss": 0.229,
       "step": 18
     },
     {
-      "epoch": 3.8076152304609217,
-      "grad_norm": 0.36588045954704285,
-      "learning_rate": 0.00011645945902807341,
-      "loss": 0.2371,
       "step": 19
     },
     {
-      "epoch": 4.008016032064128,
-      "grad_norm": 0.349997341632843,
-      "learning_rate": 0.00010825793454723325,
-      "loss": 0.2127,
       "step": 20
     },
     {
-      "epoch": 4.208416833667335,
-      "grad_norm": 0.30794546008110046,
-      "learning_rate": 0.0001,
-      "loss": 0.1856,
       "step": 21
     },
     {
-      "epoch": 4.408817635270541,
-      "grad_norm": 0.2869230806827545,
-      "learning_rate": 9.174206545276677e-05,
-      "loss": 0.152,
       "step": 22
     },
     {
-      "epoch": 4.609218436873747,
-      "grad_norm": 0.29317694902420044,
-      "learning_rate": 8.35405409719266e-05,
-      "loss": 0.1581,
       "step": 23
     },
     {
-      "epoch": 4.809619238476954,
-      "grad_norm": 0.290088951587677,
-      "learning_rate": 7.54514512859201e-05,
-      "loss": 0.1564,
       "step": 24
     },
     {
-      "epoch": 5.01002004008016,
-      "grad_norm": 0.30606502294540405,
-      "learning_rate": 6.753005307953167e-05,
-      "loss": 0.1594,
       "step": 25
     },
     {
-      "epoch": 5.210420841683367,
-      "grad_norm": 0.2588537335395813,
-      "learning_rate": 5.983045753470308e-05,
-      "loss": 0.1253,
       "step": 26
     },
     {
-      "epoch": 5.410821643286573,
-      "grad_norm": 0.2805459201335907,
-      "learning_rate": 5.240526069629265e-05,
-      "loss": 0.1488,
       "step": 27
     },
     {
-      "epoch": 5.61122244488978,
-      "grad_norm": 0.24708090722560883,
-      "learning_rate": 4.530518418775733e-05,
-      "loss": 0.1084,
       "step": 28
     },
     {
-      "epoch": 5.811623246492986,
-      "grad_norm": 0.2635113298892975,
-      "learning_rate": 3.857872873103322e-05,
-      "loss": 0.1255,
       "step": 29
     },
     {
-      "epoch": 6.012024048096192,
-      "grad_norm": 0.24471008777618408,
-      "learning_rate": 3.227184283742591e-05,
-      "loss": 0.1017,
       "step": 30
     },
     {
-      "epoch": 6.212424849699399,
-      "grad_norm": 0.23846429586410522,
-      "learning_rate": 2.6427608932686843e-05,
-      "loss": 0.1089,
       "step": 31
     },
     {
-      "epoch": 6.412825651302605,
-      "grad_norm": 0.25631099939346313,
-      "learning_rate": 2.1085949060360654e-05,
-      "loss": 0.1166,
       "step": 32
     },
     {
-      "epoch": 6.613226452905812,
-      "grad_norm": 0.24143779277801514,
-      "learning_rate": 1.6283352173747145e-05,
-      "loss": 0.1001,
       "step": 33
     },
     {
-      "epoch": 6.813627254509018,
-      "grad_norm": 0.22203697264194489,
-      "learning_rate": 1.2052624879351104e-05,
-      "loss": 0.0966,
       "step": 34
     },
     {
-      "epoch": 7.014028056112225,
-      "grad_norm": 0.228188157081604,
-      "learning_rate": 8.422667334494249e-06,
-      "loss": 0.0937,
       "step": 35
     },
     {
-      "epoch": 7.214428857715431,
-      "grad_norm": 0.22016100585460663,
-      "learning_rate": 5.418275829936537e-06,
-      "loss": 0.0982,
       "step": 36
     },
     {
-      "epoch": 7.414829659318637,
-      "grad_norm": 0.21842055022716522,
-      "learning_rate": 3.059973406066963e-06,
-      "loss": 0.0914,
       "step": 37
     },
     {
-      "epoch": 7.615230460921843,
-      "grad_norm": 0.22781485319137573,
-      "learning_rate": 1.3638696597277679e-06,
-      "loss": 0.0948,
       "step": 38
     },
     {
-      "epoch": 7.81563126252505,
-      "grad_norm": 0.23596827685832977,
-      "learning_rate": 3.415506993330153e-07,
-      "loss": 0.0959,
       "step": 39
     },
     {
-      "epoch": 8.016032064128256,
-      "grad_norm": 0.24492216110229492,
-      "learning_rate": 0.0,
-      "loss": 0.1,
       "step": 40
     }
   ],
   "logging_steps": 1,
-  "max_steps": 40,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 10,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -306,7 +551,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.4679807860064256e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 3.0,
   "eval_steps": 500,
+  "global_step": 75,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.04,
+      "grad_norm": 1.728904366493225,
+      "learning_rate": 5e-05,
+      "loss": 0.7312,
       "step": 1
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 1.7350375652313232,
+      "learning_rate": 0.0001,
+      "loss": 0.7843,
       "step": 2
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 1.4805001020431519,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6772,
       "step": 3
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 1.0534298419952393,
+      "learning_rate": 0.0002,
+      "loss": 0.6122,
       "step": 4
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.8123345375061035,
+      "learning_rate": 0.00019990212265199738,
+      "loss": 0.5329,
       "step": 5
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.6635419130325317,
+      "learning_rate": 0.00019960868220749448,
+      "loss": 0.4878,
       "step": 6
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.6072973608970642,
+      "learning_rate": 0.00019912025308994148,
+      "loss": 0.4836,
       "step": 7
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.5697150826454163,
+      "learning_rate": 0.00019843779142227256,
+      "loss": 0.5162,
       "step": 8
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.5166112780570984,
+      "learning_rate": 0.0001975626331552507,
+      "loss": 0.4825,
       "step": 9
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.5054742097854614,
+      "learning_rate": 0.00019649649145228102,
+      "loss": 0.4564,
       "step": 10
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.4537505805492401,
+      "learning_rate": 0.00019524145333581317,
+      "loss": 0.4383,
       "step": 11
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.4296068251132965,
+      "learning_rate": 0.00019379997560189675,
+      "loss": 0.4529,
       "step": 12
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.44367527961730957,
+      "learning_rate": 0.00019217488001088784,
+      "loss": 0.4545,
       "step": 13
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.4168623685836792,
+      "learning_rate": 0.0001903693477637204,
+      "loss": 0.418,
       "step": 14
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.4231944680213928,
+      "learning_rate": 0.0001883869132745561,
+      "loss": 0.435,
       "step": 15
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.4075939953327179,
+      "learning_rate": 0.00018623145725200278,
+      "loss": 0.4274,
       "step": 16
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.36284372210502625,
+      "learning_rate": 0.00018390719910244487,
+      "loss": 0.3972,
       "step": 17
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 0.3902932405471802,
+      "learning_rate": 0.00018141868867035745,
+      "loss": 0.3953,
       "step": 18
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 0.38837161660194397,
+      "learning_rate": 0.00017877079733177184,
+      "loss": 0.4294,
       "step": 19
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 0.38330700993537903,
+      "learning_rate": 0.0001759687084583285,
+      "loss": 0.4155,
       "step": 20
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.3897230625152588,
+      "learning_rate": 0.00017301790727058345,
+      "loss": 0.3791,
       "step": 21
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 0.40314897894859314,
+      "learning_rate": 0.00016992417010043142,
+      "loss": 0.4018,
       "step": 22
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 0.4068446755409241,
+      "learning_rate": 0.0001666935530836651,
+      "loss": 0.3801,
       "step": 23
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 0.40863823890686035,
+      "learning_rate": 0.0001633323803048047,
+      "loss": 0.3879,
       "step": 24
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 0.3958357274532318,
+      "learning_rate": 0.00015984723141740576,
+      "loss": 0.3929,
       "step": 25
     },
     {
+      "epoch": 1.04,
+      "grad_norm": 0.3210630416870117,
+      "learning_rate": 0.0001562449287640781,
+      "loss": 0.2722,
       "step": 26
     },
     {
+      "epoch": 1.08,
+      "grad_norm": 0.34723371267318726,
+      "learning_rate": 0.00015253252402142988,
+      "loss": 0.2701,
       "step": 27
     },
     {
+      "epoch": 1.12,
+      "grad_norm": 0.3267146050930023,
+      "learning_rate": 0.00014871728439607966,
+      "loss": 0.2654,
       "step": 28
     },
     {
+      "epoch": 1.16,
+      "grad_norm": 0.3217560350894928,
+      "learning_rate": 0.00014480667839875786,
+      "loss": 0.2653,
       "step": 29
     },
     {
+      "epoch": 1.2,
+      "grad_norm": 0.3129405975341797,
+      "learning_rate": 0.0001408083612243465,
+      "loss": 0.2495,
       "step": 30
     },
     {
+      "epoch": 1.24,
+      "grad_norm": 0.3169604241847992,
+      "learning_rate": 0.00013673015976647568,
+      "loss": 0.2783,
       "step": 31
     },
     {
+      "epoch": 1.28,
+      "grad_norm": 0.3302832543849945,
+      "learning_rate": 0.00013258005729601177,
+      "loss": 0.2589,
       "step": 32
     },
     {
+      "epoch": 1.32,
+      "grad_norm": 0.3463418781757355,
+      "learning_rate": 0.0001283661778334297,
+      "loss": 0.2453,
       "step": 33
     },
     {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.3463260531425476,
+      "learning_rate": 0.00012409677024566144,
+      "loss": 0.242,
       "step": 34
     },
     {
+      "epoch": 1.4,
+      "grad_norm": 0.3702252209186554,
+      "learning_rate": 0.00011978019209855174,
+      "loss": 0.264,
       "step": 35
     },
     {
+      "epoch": 1.44,
+      "grad_norm": 0.3509206771850586,
+      "learning_rate": 0.00011542489329653024,
+      "loss": 0.2593,
       "step": 36
     },
     {
+      "epoch": 1.48,
+      "grad_norm": 0.3612159490585327,
+      "learning_rate": 0.000111039399541527,
+      "loss": 0.2411,
       "step": 37
     },
     {
+      "epoch": 1.52,
+      "grad_norm": 0.3651520609855652,
+      "learning_rate": 0.00010663229564351041,
+      "loss": 0.2378,
       "step": 38
     },
     {
+      "epoch": 1.56,
+      "grad_norm": 0.3665476441383362,
+      "learning_rate": 0.00010221220871531869,
+      "loss": 0.2334,
       "step": 39
     },
     {
+      "epoch": 1.6,
+      "grad_norm": 0.34961438179016113,
+      "learning_rate": 9.778779128468132e-05,
+      "loss": 0.2307,
       "step": 40
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.379111111164093,
+      "learning_rate": 9.336770435648964e-05,
+      "loss": 0.2212,
+      "step": 41
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.38593631982803345,
+      "learning_rate": 8.896060045847304e-05,
+      "loss": 0.2335,
+      "step": 42
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.37961545586586,
+      "learning_rate": 8.457510670346976e-05,
+      "loss": 0.2306,
+      "step": 43
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.3735259771347046,
+      "learning_rate": 8.021980790144827e-05,
+      "loss": 0.2499,
+      "step": 44
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.40170496702194214,
+      "learning_rate": 7.590322975433857e-05,
+      "loss": 0.2275,
+      "step": 45
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.3875046372413635,
+      "learning_rate": 7.163382216657034e-05,
+      "loss": 0.218,
+      "step": 46
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.37028905749320984,
+      "learning_rate": 6.741994270398826e-05,
+      "loss": 0.2422,
+      "step": 47
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.3737669289112091,
+      "learning_rate": 6.326984023352435e-05,
+      "loss": 0.2195,
+      "step": 48
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.3924426734447479,
+      "learning_rate": 5.91916387756535e-05,
+      "loss": 0.2235,
+      "step": 49
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.36921918392181396,
+      "learning_rate": 5.5193321601242156e-05,
+      "loss": 0.2141,
+      "step": 50
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.28800830245018005,
+      "learning_rate": 5.1282715603920374e-05,
+      "loss": 0.1736,
+      "step": 51
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.31100502610206604,
+      "learning_rate": 4.746747597857014e-05,
+      "loss": 0.1578,
+      "step": 52
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.3049222528934479,
+      "learning_rate": 4.375507123592194e-05,
+      "loss": 0.1771,
+      "step": 53
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.28219300508499146,
+      "learning_rate": 4.015276858259427e-05,
+      "loss": 0.1476,
+      "step": 54
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.3022613525390625,
+      "learning_rate": 3.6667619695195285e-05,
+      "loss": 0.1779,
+      "step": 55
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.2823966443538666,
+      "learning_rate": 3.330644691633492e-05,
+      "loss": 0.1501,
+      "step": 56
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.28174689412117004,
+      "learning_rate": 3.0075829899568597e-05,
+      "loss": 0.1511,
+      "step": 57
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.2776714861392975,
+      "learning_rate": 2.6982092729416587e-05,
+      "loss": 0.1568,
+      "step": 58
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.2745690643787384,
+      "learning_rate": 2.403129154167153e-05,
+      "loss": 0.1393,
+      "step": 59
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.2862659692764282,
+      "learning_rate": 2.1229202668228197e-05,
+      "loss": 0.1568,
+      "step": 60
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.30168795585632324,
+      "learning_rate": 1.858131132964259e-05,
+      "loss": 0.164,
+      "step": 61
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.30739548802375793,
+      "learning_rate": 1.609280089755515e-05,
+      "loss": 0.1595,
+      "step": 62
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.2983320355415344,
+      "learning_rate": 1.3768542747997215e-05,
+      "loss": 0.174,
+      "step": 63
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.30526575446128845,
+      "learning_rate": 1.161308672544389e-05,
+      "loss": 0.168,
+      "step": 64
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.28905755281448364,
+      "learning_rate": 9.630652236279625e-06,
+      "loss": 0.1557,
+      "step": 65
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.29685401916503906,
+      "learning_rate": 7.825119989112173e-06,
+      "loss": 0.1531,
+      "step": 66
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.309733510017395,
+      "learning_rate": 6.200024398103255e-06,
+      "loss": 0.1538,
+      "step": 67
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.31325626373291016,
+      "learning_rate": 4.758546664186869e-06,
+      "loss": 0.16,
+      "step": 68
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.29155057668685913,
+      "learning_rate": 3.5035085477190143e-06,
+      "loss": 0.1612,
+      "step": 69
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.31043997406959534,
+      "learning_rate": 2.4373668447493224e-06,
+      "loss": 0.1655,
+      "step": 70
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.30450770258903503,
+      "learning_rate": 1.562208577727442e-06,
+      "loss": 0.1581,
+      "step": 71
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.2816106677055359,
+      "learning_rate": 8.797469100585431e-07,
+      "loss": 0.1369,
+      "step": 72
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.3166625201702118,
+      "learning_rate": 3.913177925055189e-07,
+      "loss": 0.1724,
+      "step": 73
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.30547070503234863,
+      "learning_rate": 9.78773480026396e-08,
+      "loss": 0.1555,
+      "step": 74
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.29624301195144653,
+      "learning_rate": 0.0,
+      "loss": 0.1598,
+      "step": 75
     }
   ],
   "logging_steps": 1,
+  "max_steps": 75,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 9.106714169779814e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8923db3bbc9d64f25728011e5b67d183e43b0af93caeb0253ce117a78cbdba2
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:c82f1076477830c5279e886b1bb2af3d8f3ef16a462c5c5df5187f2aded327b7
 size 5240