Update fine-tuned model

Browse files

Files changed (7) hide show

adapter_config.json +4 -4
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +727 -202
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "gate_proj",
     "up_proj",
-    "o_proj",
     "down_proj",
     "q_proj",
-    "v_proj"
   ],
   "task_type": " CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "up_proj",
+    "v_proj",
     "down_proj",
     "q_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
   ],
   "task_type": " CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9a83e9a0016f9da67a243e4a1e37823c5fd485eab0afee3026b8ef7de3841085
 size 159967880

 version https://git-lfs.github.com/spec/v1
+oid sha256:8dc3526319616f564084803a55a4634c00e35413e9965c9cc8ea0d5da4e03d31
 size 159967880

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ecccb0d08fadf6a1ad6f7ab16c4ac9074e62eccf977a1c221a8fb9b0f83894a
 size 852876198

 version https://git-lfs.github.com/spec/v1
+oid sha256:0e24dec637d4370a49253e03f0da5dfa9c777bfbeb86bfe5ba98c750ed05d739
 size 852876198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c09ae3d43feaff4e0394c4352b4f7f3a126522099688de7cea344ac7d55250d9
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:a1e42b4ab9aefc741b06ae8d61682e61a1ced93aca1dc65b7a76e4f5950596de
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e69e2b49ea642509f0c688c16fb190b7cf27dac0a18903a5e2d1467d0343d8b8
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:25c0243da09e051ffbb600413baac079966a139cd6939cbef7eb321283bd0767
 size 1064

trainer_state.json CHANGED Viewed

@@ -3,364 +3,889 @@
   "best_model_checkpoint": null,
   "epoch": 5.0,
   "eval_steps": 500,
-  "global_step": 50,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.1,
-      "grad_norm": 0.7374217510223389,
-      "learning_rate": 6.666666666666667e-05,
-      "loss": 0.828,
       "step": 1
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.7326979041099548,
-      "learning_rate": 0.00013333333333333334,
-      "loss": 0.8692,
       "step": 2
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.5996476411819458,
-      "learning_rate": 0.0002,
-      "loss": 0.7978,
       "step": 3
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.4960692524909973,
-      "learning_rate": 0.00019977668786231534,
-      "loss": 0.7581,
       "step": 4
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.38662832975387573,
-      "learning_rate": 0.000199107748815478,
-      "loss": 0.7099,
       "step": 5
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.47017937898635864,
-      "learning_rate": 0.0001979961705036587,
-      "loss": 0.6783,
       "step": 6
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.39107415080070496,
-      "learning_rate": 0.00019644691750543767,
-      "loss": 0.6699,
       "step": 7
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.31766048073768616,
-      "learning_rate": 0.0001944669091607919,
-      "loss": 0.6986,
       "step": 8
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 0.28557154536247253,
-      "learning_rate": 0.00019206498866764288,
-      "loss": 0.6539,
       "step": 9
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.2970712184906006,
-      "learning_rate": 0.00018925188358598813,
-      "loss": 0.6672,
       "step": 10
     },
     {
-      "epoch": 1.1,
-      "grad_norm": 0.33635565638542175,
-      "learning_rate": 0.00018604015792601396,
-      "loss": 0.5643,
       "step": 11
     },
     {
-      "epoch": 1.2,
-      "grad_norm": 0.34821704030036926,
-      "learning_rate": 0.00018244415603417603,
-      "loss": 0.5525,
       "step": 12
     },
     {
-      "epoch": 1.3,
-      "grad_norm": 0.3633900582790375,
-      "learning_rate": 0.0001784799385278661,
-      "loss": 0.5256,
       "step": 13
     },
     {
-      "epoch": 1.4,
-      "grad_norm": 0.3432920575141907,
-      "learning_rate": 0.00017416521056479577,
-      "loss": 0.5193,
       "step": 14
     },
     {
-      "epoch": 1.5,
-      "grad_norm": 0.3452214002609253,
-      "learning_rate": 0.00016951924276746425,
-      "loss": 0.5046,
       "step": 15
     },
     {
-      "epoch": 1.6,
-      "grad_norm": 0.29285934567451477,
-      "learning_rate": 0.00016456278515588024,
-      "loss": 0.4897,
       "step": 16
     },
     {
-      "epoch": 1.7,
-      "grad_norm": 0.30729544162750244,
-      "learning_rate": 0.00015931797447293552,
-      "loss": 0.501,
       "step": 17
     },
     {
-      "epoch": 1.8,
-      "grad_norm": 0.2720145583152771,
-      "learning_rate": 0.00015380823531633729,
-      "loss": 0.4759,
       "step": 18
     },
     {
-      "epoch": 1.9,
-      "grad_norm": 0.2651268541812897,
-      "learning_rate": 0.00014805817551866838,
-      "loss": 0.4527,
       "step": 19
     },
     {
-      "epoch": 2.0,
-      "grad_norm": 0.2597902715206146,
-      "learning_rate": 0.0001420934762428335,
-      "loss": 0.4479,
       "step": 20
     },
     {
-      "epoch": 2.1,
-      "grad_norm": 0.2531144917011261,
-      "learning_rate": 0.00013594077728375128,
-      "loss": 0.4056,
       "step": 21
     },
     {
-      "epoch": 2.2,
-      "grad_norm": 0.2363481968641281,
-      "learning_rate": 0.00012962755808856342,
-      "loss": 0.3951,
       "step": 22
     },
     {
-      "epoch": 2.3,
-      "grad_norm": 0.23172461986541748,
-      "learning_rate": 0.00012318201502675285,
-      "loss": 0.3863,
       "step": 23
     },
     {
-      "epoch": 2.4,
-      "grad_norm": 0.24559740722179413,
-      "learning_rate": 0.00011663293545831302,
-      "loss": 0.3809,
       "step": 24
     },
     {
-      "epoch": 2.5,
-      "grad_norm": 0.24797773361206055,
-      "learning_rate": 0.00011000956916240985,
-      "loss": 0.3768,
       "step": 25
     },
     {
-      "epoch": 2.6,
-      "grad_norm": 0.25474536418914795,
-      "learning_rate": 0.00010334149770076747,
-      "loss": 0.3628,
       "step": 26
     },
     {
-      "epoch": 2.7,
-      "grad_norm": 0.25486743450164795,
-      "learning_rate": 9.665850229923258e-05,
-      "loss": 0.3603,
       "step": 27
     },
     {
-      "epoch": 2.8,
-      "grad_norm": 0.26718005537986755,
-      "learning_rate": 8.999043083759017e-05,
-      "loss": 0.3584,
       "step": 28
     },
     {
-      "epoch": 2.9,
-      "grad_norm": 0.2610970139503479,
-      "learning_rate": 8.336706454168701e-05,
-      "loss": 0.3546,
       "step": 29
     },
     {
-      "epoch": 3.0,
-      "grad_norm": 0.2513510584831238,
-      "learning_rate": 7.681798497324716e-05,
-      "loss": 0.3507,
       "step": 30
     },
     {
-      "epoch": 3.1,
-      "grad_norm": 0.23601773381233215,
-      "learning_rate": 7.037244191143661e-05,
-      "loss": 0.3212,
       "step": 31
     },
     {
-      "epoch": 3.2,
-      "grad_norm": 0.2521430253982544,
-      "learning_rate": 6.405922271624874e-05,
-      "loss": 0.3399,
       "step": 32
     },
     {
-      "epoch": 3.3,
-      "grad_norm": 0.24177978932857513,
-      "learning_rate": 5.790652375716652e-05,
-      "loss": 0.2891,
       "step": 33
     },
     {
-      "epoch": 3.4,
-      "grad_norm": 0.25264158844947815,
-      "learning_rate": 5.1941824481331626e-05,
-      "loss": 0.3047,
       "step": 34
     },
     {
-      "epoch": 3.5,
-      "grad_norm": 0.23934145271778107,
-      "learning_rate": 4.6191764683662744e-05,
-      "loss": 0.2896,
       "step": 35
     },
     {
-      "epoch": 3.6,
-      "grad_norm": 0.25319987535476685,
-      "learning_rate": 4.0682025527064486e-05,
-      "loss": 0.2977,
       "step": 36
     },
     {
-      "epoch": 3.7,
-      "grad_norm": 0.23155006766319275,
-      "learning_rate": 3.543721484411976e-05,
-      "loss": 0.2793,
       "step": 37
     },
     {
-      "epoch": 3.8,
-      "grad_norm": 0.2498036026954651,
-      "learning_rate": 3.0480757232535772e-05,
-      "loss": 0.3165,
       "step": 38
     },
     {
-      "epoch": 3.9,
-      "grad_norm": 0.23940016329288483,
-      "learning_rate": 2.5834789435204243e-05,
-      "loss": 0.2938,
       "step": 39
     },
     {
-      "epoch": 4.0,
-      "grad_norm": 0.23618489503860474,
-      "learning_rate": 2.1520061472133902e-05,
-      "loss": 0.2713,
       "step": 40
     },
     {
-      "epoch": 4.1,
-      "grad_norm": 0.22089381515979767,
-      "learning_rate": 1.7555843965823992e-05,
-      "loss": 0.2618,
       "step": 41
     },
     {
-      "epoch": 4.2,
-      "grad_norm": 0.2343159019947052,
-      "learning_rate": 1.3959842073986085e-05,
-      "loss": 0.2668,
       "step": 42
     },
     {
-      "epoch": 4.3,
-      "grad_norm": 0.2323874831199646,
-      "learning_rate": 1.0748116414011888e-05,
-      "loss": 0.2645,
       "step": 43
     },
     {
-      "epoch": 4.4,
-      "grad_norm": 0.24557726085186005,
-      "learning_rate": 7.935011332357112e-06,
-      "loss": 0.2728,
       "step": 44
     },
     {
-      "epoch": 4.5,
-      "grad_norm": 0.24076133966445923,
-      "learning_rate": 5.533090839208133e-06,
-      "loss": 0.2729,
       "step": 45
     },
     {
-      "epoch": 4.6,
-      "grad_norm": 0.22625155746936798,
-      "learning_rate": 3.5530824945623542e-06,
-      "loss": 0.2929,
       "step": 46
     },
     {
-      "epoch": 4.7,
-      "grad_norm": 0.23357822000980377,
-      "learning_rate": 2.003829496341325e-06,
-      "loss": 0.2691,
       "step": 47
     },
     {
-      "epoch": 4.8,
-      "grad_norm": 0.22757555544376373,
-      "learning_rate": 8.922511845219971e-07,
-      "loss": 0.2713,
       "step": 48
     },
     {
-      "epoch": 4.9,
-      "grad_norm": 0.23513054847717285,
-      "learning_rate": 2.2331213768468363e-07,
-      "loss": 0.2777,
       "step": 49
     },
     {
       "epoch": 5.0,
-      "grad_norm": 0.23401321470737457,
       "learning_rate": 0.0,
-      "loss": 0.2611,
-      "step": 50
     }
   ],
   "logging_steps": 1,
-  "max_steps": 50,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 5,
   "save_steps": 500,
@@ -376,7 +901,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.430306788466688e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 5.0,
   "eval_steps": 500,
+  "global_step": 125,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.04,
+      "grad_norm": 0.7209197878837585,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 0.8555,
       "step": 1
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 0.7282789945602417,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.8511,
       "step": 2
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.6870177984237671,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 0.8511,
       "step": 3
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.5710985660552979,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 0.7533,
       "step": 4
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.4873427450656891,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.7636,
       "step": 5
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.44160357117652893,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 0.7484,
       "step": 6
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.38276153802871704,
+      "learning_rate": 0.0002,
+      "loss": 0.7157,
       "step": 7
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.3445475101470947,
+      "learning_rate": 0.00019996456111234527,
+      "loss": 0.7039,
       "step": 8
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.31933900713920593,
+      "learning_rate": 0.0001998582695676762,
+      "loss": 0.7062,
       "step": 9
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.3282822072505951,
+      "learning_rate": 0.000199681200703075,
+      "loss": 0.703,
       "step": 10
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.3057241439819336,
+      "learning_rate": 0.00019943348002101371,
+      "loss": 0.704,
       "step": 11
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.28319624066352844,
+      "learning_rate": 0.00019911528310040074,
+      "loss": 0.6743,
       "step": 12
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.2568361461162567,
+      "learning_rate": 0.00019872683547213446,
+      "loss": 0.6933,
       "step": 13
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.25777217745780945,
+      "learning_rate": 0.00019826841245925212,
+      "loss": 0.6664,
       "step": 14
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.2636028528213501,
+      "learning_rate": 0.00019774033898178667,
+      "loss": 0.6815,
       "step": 15
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.2726347744464874,
+      "learning_rate": 0.00019714298932647098,
+      "loss": 0.6769,
       "step": 16
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.2455451488494873,
+      "learning_rate": 0.0001964767868814516,
+      "loss": 0.6499,
       "step": 17
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 0.2470139116048813,
+      "learning_rate": 0.00019574220383620055,
+      "loss": 0.6729,
       "step": 18
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 0.2509821057319641,
+      "learning_rate": 0.00019493976084683813,
+      "loss": 0.6548,
       "step": 19
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 0.2554020583629608,
+      "learning_rate": 0.00019407002666710336,
+      "loss": 0.6156,
       "step": 20
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.26096639037132263,
+      "learning_rate": 0.00019313361774523385,
+      "loss": 0.6278,
       "step": 21
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 0.25810953974723816,
+      "learning_rate": 0.00019213119778704128,
+      "loss": 0.6334,
       "step": 22
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 0.24838820099830627,
+      "learning_rate": 0.00019106347728549135,
+      "loss": 0.6238,
       "step": 23
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 0.2597545385360718,
+      "learning_rate": 0.00018993121301712193,
+      "loss": 0.6083,
       "step": 24
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 0.2563561797142029,
+      "learning_rate": 0.00018873520750565718,
+      "loss": 0.632,
       "step": 25
     },
     {
+      "epoch": 1.04,
+      "grad_norm": 0.2539364993572235,
+      "learning_rate": 0.00018747630845319612,
+      "loss": 0.53,
       "step": 26
     },
     {
+      "epoch": 1.08,
+      "grad_norm": 0.27705103158950806,
+      "learning_rate": 0.0001861554081393806,
+      "loss": 0.5125,
       "step": 27
     },
     {
+      "epoch": 1.12,
+      "grad_norm": 0.2639136016368866,
+      "learning_rate": 0.0001847734427889671,
+      "loss": 0.514,
       "step": 28
     },
     {
+      "epoch": 1.16,
+      "grad_norm": 0.24399739503860474,
+      "learning_rate": 0.0001833313919082515,
+      "loss": 0.5203,
       "step": 29
     },
     {
+      "epoch": 1.2,
+      "grad_norm": 0.24673549830913544,
+      "learning_rate": 0.0001818302775908169,
+      "loss": 0.4994,
       "step": 30
     },
     {
+      "epoch": 1.24,
+      "grad_norm": 0.24222399294376373,
+      "learning_rate": 0.00018027116379309638,
+      "loss": 0.5129,
       "step": 31
     },
     {
+      "epoch": 1.28,
+      "grad_norm": 0.23906999826431274,
+      "learning_rate": 0.00017865515558026428,
+      "loss": 0.4904,
       "step": 32
     },
     {
+      "epoch": 1.32,
+      "grad_norm": 0.2312118262052536,
+      "learning_rate": 0.00017698339834299061,
+      "loss": 0.4781,
       "step": 33
     },
     {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.24449752271175385,
+      "learning_rate": 0.00017525707698561385,
+      "loss": 0.4928,
       "step": 34
     },
     {
+      "epoch": 1.4,
+      "grad_norm": 0.24433298408985138,
+      "learning_rate": 0.00017347741508630672,
+      "loss": 0.4916,
       "step": 35
     },
     {
+      "epoch": 1.44,
+      "grad_norm": 0.25431713461875916,
+      "learning_rate": 0.00017164567402983152,
+      "loss": 0.47,
       "step": 36
     },
     {
+      "epoch": 1.48,
+      "grad_norm": 0.25172215700149536,
+      "learning_rate": 0.0001697631521134985,
+      "loss": 0.4794,
       "step": 37
     },
     {
+      "epoch": 1.52,
+      "grad_norm": 0.25659480690956116,
+      "learning_rate": 0.00016783118362696163,
+      "loss": 0.4773,
       "step": 38
     },
     {
+      "epoch": 1.56,
+      "grad_norm": 0.2669978141784668,
+      "learning_rate": 0.00016585113790650388,
+      "loss": 0.4634,
       "step": 39
     },
     {
+      "epoch": 1.6,
+      "grad_norm": 0.26926499605178833,
+      "learning_rate": 0.00016382441836448202,
+      "loss": 0.4416,
       "step": 40
     },
     {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.26841145753860474,
+      "learning_rate": 0.0001617524614946192,
+      "loss": 0.4492,
       "step": 41
     },
     {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.26070988178253174,
+      "learning_rate": 0.00015963673585385016,
+      "loss": 0.4316,
       "step": 42
     },
     {
+      "epoch": 1.72,
+      "grad_norm": 0.2793709337711334,
+      "learning_rate": 0.0001574787410214407,
+      "loss": 0.4443,
       "step": 43
     },
     {
+      "epoch": 1.76,
+      "grad_norm": 0.2722426652908325,
+      "learning_rate": 0.00015528000653611935,
+      "loss": 0.4469,
       "step": 44
     },
     {
+      "epoch": 1.8,
+      "grad_norm": 0.2657548487186432,
+      "learning_rate": 0.00015304209081197425,
+      "loss": 0.4456,
       "step": 45
     },
     {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.28950250148773193,
+      "learning_rate": 0.000150766580033884,
+      "loss": 0.4388,
       "step": 46
     },
     {
+      "epoch": 1.88,
+      "grad_norm": 0.2785035967826843,
+      "learning_rate": 0.00014845508703326504,
+      "loss": 0.4269,
       "step": 47
     },
     {
+      "epoch": 1.92,
+      "grad_norm": 0.2793751060962677,
+      "learning_rate": 0.0001461092501449326,
+      "loss": 0.4182,
       "step": 48
     },
     {
+      "epoch": 1.96,
+      "grad_norm": 0.28629055619239807,
+      "learning_rate": 0.00014373073204588556,
+      "loss": 0.4433,
       "step": 49
     },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.2770131528377533,
+      "learning_rate": 0.00014132121857683783,
+      "loss": 0.4447,
+      "step": 50
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.2512931823730469,
+      "learning_rate": 0.00013888241754733208,
+      "loss": 0.3446,
+      "step": 51
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.25569790601730347,
+      "learning_rate": 0.00013641605752528224,
+      "loss": 0.385,
+      "step": 52
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.2556793689727783,
+      "learning_rate": 0.00013392388661180303,
+      "loss": 0.3545,
+      "step": 53
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.2621767818927765,
+      "learning_rate": 0.0001314076712021949,
+      "loss": 0.3555,
+      "step": 54
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.2598123550415039,
+      "learning_rate": 0.0001288691947339621,
+      "loss": 0.3403,
+      "step": 55
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.26771417260169983,
+      "learning_rate": 0.00012631025642275212,
+      "loss": 0.3244,
+      "step": 56
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.27799421548843384,
+      "learning_rate": 0.0001237326699871115,
+      "loss": 0.3281,
+      "step": 57
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.2827088534832001,
+      "learning_rate": 0.00012113826236296244,
+      "loss": 0.3308,
+      "step": 58
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.28123676776885986,
+      "learning_rate": 0.00011852887240871145,
+      "loss": 0.3267,
+      "step": 59
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.30861011147499084,
+      "learning_rate": 0.00011590634960190721,
+      "loss": 0.32,
+      "step": 60
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.29810547828674316,
+      "learning_rate": 0.00011327255272837221,
+      "loss": 0.3283,
+      "step": 61
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.296453058719635,
+      "learning_rate": 0.00011062934856473655,
+      "loss": 0.3302,
+      "step": 62
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.32325005531311035,
+      "learning_rate": 0.00010797861055530831,
+      "loss": 0.3381,
+      "step": 63
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.29035669565200806,
+      "learning_rate": 0.00010532221748421787,
+      "loss": 0.3189,
+      "step": 64
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.3050059974193573,
+      "learning_rate": 0.00010266205214377748,
+      "loss": 0.3047,
+      "step": 65
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.30324605107307434,
+      "learning_rate": 0.0001,
+      "loss": 0.3054,
+      "step": 66
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.3131050169467926,
+      "learning_rate": 9.733794785622253e-05,
+      "loss": 0.3238,
+      "step": 67
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.3103804886341095,
+      "learning_rate": 9.467778251578217e-05,
+      "loss": 0.336,
+      "step": 68
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.3039088845252991,
+      "learning_rate": 9.202138944469168e-05,
+      "loss": 0.3304,
+      "step": 69
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.3010499179363251,
+      "learning_rate": 8.937065143526347e-05,
+      "loss": 0.3126,
+      "step": 70
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.2960353195667267,
+      "learning_rate": 8.672744727162781e-05,
+      "loss": 0.3201,
+      "step": 71
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.2927502393722534,
+      "learning_rate": 8.409365039809281e-05,
+      "loss": 0.307,
+      "step": 72
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.29815661907196045,
+      "learning_rate": 8.147112759128859e-05,
+      "loss": 0.313,
+      "step": 73
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.2862599790096283,
+      "learning_rate": 7.886173763703757e-05,
+      "loss": 0.3216,
+      "step": 74
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.28218919038772583,
+      "learning_rate": 7.626733001288851e-05,
+      "loss": 0.3021,
+      "step": 75
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.29704561829566956,
+      "learning_rate": 7.368974357724789e-05,
+      "loss": 0.2639,
+      "step": 76
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 0.2927261292934418,
+      "learning_rate": 7.113080526603792e-05,
+      "loss": 0.2633,
+      "step": 77
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.2776476740837097,
+      "learning_rate": 6.859232879780515e-05,
+      "loss": 0.2565,
+      "step": 78
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 0.27619966864585876,
+      "learning_rate": 6.607611338819697e-05,
+      "loss": 0.2528,
+      "step": 79
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.27579841017723083,
+      "learning_rate": 6.358394247471778e-05,
+      "loss": 0.2512,
+      "step": 80
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.2667072117328644,
+      "learning_rate": 6.111758245266794e-05,
+      "loss": 0.2543,
+      "step": 81
+    },
+    {
+      "epoch": 3.2800000000000002,
+      "grad_norm": 0.2724190056324005,
+      "learning_rate": 5.867878142316221e-05,
+      "loss": 0.2619,
+      "step": 82
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 0.2652760446071625,
+      "learning_rate": 5.626926795411447e-05,
+      "loss": 0.2402,
+      "step": 83
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 0.27490079402923584,
+      "learning_rate": 5.38907498550674e-05,
+      "loss": 0.2363,
+      "step": 84
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.280770480632782,
+      "learning_rate": 5.1544912966734994e-05,
+      "loss": 0.2517,
+      "step": 85
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.3093857169151306,
+      "learning_rate": 4.9233419966116036e-05,
+      "loss": 0.2593,
+      "step": 86
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 0.2780980169773102,
+      "learning_rate": 4.695790918802576e-05,
+      "loss": 0.2355,
+      "step": 87
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 0.2719030976295471,
+      "learning_rate": 4.47199934638807e-05,
+      "loss": 0.2305,
+      "step": 88
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 0.3024630546569824,
+      "learning_rate": 4.252125897855932e-05,
+      "loss": 0.2359,
+      "step": 89
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.27690544724464417,
+      "learning_rate": 4.036326414614985e-05,
+      "loss": 0.245,
+      "step": 90
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 0.27956703305244446,
+      "learning_rate": 3.824753850538082e-05,
+      "loss": 0.2247,
+      "step": 91
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 0.28436169028282166,
+      "learning_rate": 3.617558163551802e-05,
+      "loss": 0.2467,
+      "step": 92
+    },
+    {
+      "epoch": 3.7199999999999998,
+      "grad_norm": 0.28021368384361267,
+      "learning_rate": 3.414886209349615e-05,
+      "loss": 0.2435,
+      "step": 93
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.3015545904636383,
+      "learning_rate": 3.216881637303839e-05,
+      "loss": 0.2437,
+      "step": 94
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.28339913487434387,
+      "learning_rate": 3.0236847886501542e-05,
+      "loss": 0.2509,
+      "step": 95
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.3016417324542999,
+      "learning_rate": 2.8354325970168484e-05,
+      "loss": 0.2596,
+      "step": 96
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 0.284646213054657,
+      "learning_rate": 2.6522584913693294e-05,
+      "loss": 0.2343,
+      "step": 97
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.30606040358543396,
+      "learning_rate": 2.4742923014386156e-05,
+      "loss": 0.2486,
+      "step": 98
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 0.28671491146087646,
+      "learning_rate": 2.301660165700936e-05,
+      "loss": 0.2265,
+      "step": 99
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.2809029519557953,
+      "learning_rate": 2.1344844419735755e-05,
+      "loss": 0.2378,
+      "step": 100
+    },
+    {
+      "epoch": 4.04,
+      "grad_norm": 0.2681520879268646,
+      "learning_rate": 1.9728836206903656e-05,
+      "loss": 0.2296,
+      "step": 101
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 0.2830915153026581,
+      "learning_rate": 1.8169722409183097e-05,
+      "loss": 0.2207,
+      "step": 102
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 0.2674114406108856,
+      "learning_rate": 1.6668608091748495e-05,
+      "loss": 0.2273,
+      "step": 103
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 0.26131343841552734,
+      "learning_rate": 1.522655721103291e-05,
+      "loss": 0.2048,
+      "step": 104
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 0.26205572485923767,
+      "learning_rate": 1.3844591860619383e-05,
+      "loss": 0.2012,
+      "step": 105
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 0.2575109302997589,
+      "learning_rate": 1.2523691546803873e-05,
+      "loss": 0.2011,
+      "step": 106
+    },
+    {
+      "epoch": 4.28,
+      "grad_norm": 0.25955626368522644,
+      "learning_rate": 1.1264792494342857e-05,
+      "loss": 0.1869,
+      "step": 107
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 0.2660517692565918,
+      "learning_rate": 1.0068786982878087e-05,
+      "loss": 0.218,
+      "step": 108
+    },
+    {
+      "epoch": 4.36,
+      "grad_norm": 0.2595648765563965,
+      "learning_rate": 8.936522714508678e-06,
+      "loss": 0.218,
+      "step": 109
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.2599768042564392,
+      "learning_rate": 7.868802212958703e-06,
+      "loss": 0.199,
+      "step": 110
+    },
+    {
+      "epoch": 4.44,
+      "grad_norm": 0.26886242628097534,
+      "learning_rate": 6.866382254766157e-06,
+      "loss": 0.2137,
+      "step": 111
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 0.2693222165107727,
+      "learning_rate": 5.929973332896677e-06,
+      "loss": 0.1991,
+      "step": 112
+    },
+    {
+      "epoch": 4.52,
+      "grad_norm": 0.2585678994655609,
+      "learning_rate": 5.060239153161872e-06,
+      "loss": 0.2072,
+      "step": 113
+    },
+    {
+      "epoch": 4.5600000000000005,
+      "grad_norm": 0.26331010460853577,
+      "learning_rate": 4.257796163799455e-06,
+      "loss": 0.2093,
+      "step": 114
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 0.2705547511577606,
+      "learning_rate": 3.5232131185484076e-06,
+      "loss": 0.2187,
+      "step": 115
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 0.2604030668735504,
+      "learning_rate": 2.857010673529015e-06,
+      "loss": 0.2081,
+      "step": 116
+    },
+    {
+      "epoch": 4.68,
+      "grad_norm": 0.2516978681087494,
+      "learning_rate": 2.259661018213333e-06,
+      "loss": 0.2049,
+      "step": 117
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 0.2609613239765167,
+      "learning_rate": 1.7315875407479032e-06,
+      "loss": 0.2193,
+      "step": 118
+    },
+    {
+      "epoch": 4.76,
+      "grad_norm": 0.25738459825515747,
+      "learning_rate": 1.2731645278655445e-06,
+      "loss": 0.1945,
+      "step": 119
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.2718040347099304,
+      "learning_rate": 8.847168995992916e-07,
+      "loss": 0.2248,
+      "step": 120
+    },
+    {
+      "epoch": 4.84,
+      "grad_norm": 0.26361677050590515,
+      "learning_rate": 5.665199789862907e-07,
+      "loss": 0.2188,
+      "step": 121
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 0.2609327435493469,
+      "learning_rate": 3.1879929692498757e-07,
+      "loss": 0.1975,
+      "step": 122
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 0.2681811451911926,
+      "learning_rate": 1.4173043232380557e-07,
+      "loss": 0.2177,
+      "step": 123
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 0.25932785868644714,
+      "learning_rate": 3.5438887654737355e-08,
+      "loss": 0.2266,
+      "step": 124
+    },
     {
       "epoch": 5.0,
+      "grad_norm": 0.26163730025291443,
       "learning_rate": 0.0,
+      "loss": 0.2116,
+      "step": 125
     }
   ],
   "logging_steps": 1,
+  "max_steps": 125,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 5,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 1.8787073669726208e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e78e03cdc8dd492bf3e2caa50f5fa2767b7f8b075cdb59edee3e79f289bb3410
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:4114bcdcd5ee84b4b6297142cec343da8142ccd3caff500bedf3bed40df3d2fa
 size 5240