Namronaldo2004 committed
Commit 07c543d · 1 Parent(s): cbbcf00

Update fine-tuned model

adapter_config.json CHANGED
@@ -22,10 +22,10 @@
   "target_modules": [
     "q_proj",
     "k_proj",
-    "gate_proj",
     "down_proj",
-    "up_proj",
     "v_proj",
+    "gate_proj",
+    "up_proj",
     "o_proj"
   ],
   "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d054996c8870886154c89eb4b321cffae3782503afa3093805cae5b92e89d63
+oid sha256:7f33495d2a06946619fbcf27dfad787f5d2ecbaff7594e36553591b83763f5dc
 size 159967880
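
The adapter weights change content but keep the same size (159,967,880 bytes); only the LFS object hash is updated. A hedged sketch of how such an adapter is typically loaded on top of its base model follows; both identifiers are placeholders, since neither the base model nor the repo id is stated in this commit.

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Placeholders (assumption): substitute the real base model and adapter repo
# identifiers, which this commit does not name.
base = AutoModelForCausalLM.from_pretrained("<base-model-id>")
model = PeftModel.from_pretrained(base, "<adapter-repo-id>")
model.eval()
```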
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:75b0df8cc32c0c3fdd058abda370de0bd80f3dfa6026c5b8d7404204aa249974
+oid sha256:ad2b6acd8c794a174246823de83003bf746dd08a56ee2debca38260dd8ede8c0
 size 852876198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34e497cead5b5d8cb6e8ac2be926d8e9aecf8f729139a14383abc9f0da4763cc
+oid sha256:f14f3fae9bd6d8bdd744584b0dede52dcf273aa344fff0056431563d7dff0e18
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4521b8db9cc205e54aa606d85e707c024abd2d8ad4a20bec4b2cff365dc59cdf
+oid sha256:e69e2b49ea642509f0c688c16fb190b7cf27dac0a18903a5e2d1467d0343d8b8
 size 1064
trainer_state.json CHANGED
@@ -3,189 +3,364 @@
   "best_model_checkpoint": null,
   "epoch": 5.0,
   "eval_steps": 500,
-  "global_step": 25,
+  "global_step": 50,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.2,
-      "grad_norm": 0.6263333559036255,
-      "learning_rate": 0.0001,
-      "loss": 0.6512,
+      "epoch": 0.1,
+      "grad_norm": 0.47972142696380615,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.6289,
       "step": 1
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.659807562828064,
-      "learning_rate": 0.0002,
-      "loss": 0.6388,
+      "epoch": 0.2,
+      "grad_norm": 0.466259241104126,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.622,
       "step": 2
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.4718220829963684,
-      "learning_rate": 0.00019906859460363307,
-      "loss": 0.6469,
+      "epoch": 0.3,
+      "grad_norm": 0.4508052170276642,
+      "learning_rate": 0.0002,
+      "loss": 0.6472,
       "step": 3
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.3931182324886322,
-      "learning_rate": 0.00019629172873477995,
-      "loss": 0.6385,
+      "epoch": 0.4,
+      "grad_norm": 0.39328619837760925,
+      "learning_rate": 0.00019977668786231534,
+      "loss": 0.6188,
       "step": 4
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.394342839717865,
-      "learning_rate": 0.00019172113015054532,
-      "loss": 0.6754,
+      "epoch": 0.5,
+      "grad_norm": 0.34969136118888855,
+      "learning_rate": 0.000199107748815478,
+      "loss": 0.5904,
       "step": 5
     },
     {
-      "epoch": 1.2,
-      "grad_norm": 0.4089839458465576,
-      "learning_rate": 0.00018544194045464886,
-      "loss": 0.5292,
+      "epoch": 0.6,
+      "grad_norm": 0.34362491965293884,
+      "learning_rate": 0.0001979961705036587,
+      "loss": 0.6218,
       "step": 6
     },
     {
-      "epoch": 1.4,
-      "grad_norm": 0.39185038208961487,
-      "learning_rate": 0.000177571129070442,
-      "loss": 0.5756,
+      "epoch": 0.7,
+      "grad_norm": 0.3137108385562897,
+      "learning_rate": 0.00019644691750543767,
+      "loss": 0.5773,
       "step": 7
     },
     {
-      "epoch": 1.6,
-      "grad_norm": 0.35863593220710754,
-      "learning_rate": 0.00016825531432186543,
-      "loss": 0.5311,
+      "epoch": 0.8,
+      "grad_norm": 0.3064671754837036,
+      "learning_rate": 0.0001944669091607919,
+      "loss": 0.6056,
       "step": 8
     },
     {
-      "epoch": 1.8,
-      "grad_norm": 0.3710576891899109,
-      "learning_rate": 0.00015766803221148673,
-      "loss": 0.5056,
+      "epoch": 0.9,
+      "grad_norm": 0.28781741857528687,
+      "learning_rate": 0.00019206498866764288,
+      "loss": 0.6026,
       "step": 9
     },
     {
-      "epoch": 2.0,
-      "grad_norm": 0.365957647562027,
-      "learning_rate": 0.00014600650377311522,
-      "loss": 0.4892,
+      "epoch": 1.0,
+      "grad_norm": 0.29755786061286926,
+      "learning_rate": 0.00018925188358598813,
+      "loss": 0.6226,
       "step": 10
     },
     {
-      "epoch": 2.2,
-      "grad_norm": 0.34360572695732117,
-      "learning_rate": 0.00013348796121709862,
-      "loss": 0.4838,
+      "epoch": 1.1,
+      "grad_norm": 0.32020193338394165,
+      "learning_rate": 0.00018604015792601396,
+      "loss": 0.4575,
       "step": 11
     },
     {
-      "epoch": 2.4,
-      "grad_norm": 0.33910414576530457,
-      "learning_rate": 0.0001203456013052634,
-      "loss": 0.4295,
+      "epoch": 1.2,
+      "grad_norm": 0.37006428837776184,
+      "learning_rate": 0.00018244415603417603,
+      "loss": 0.4834,
       "step": 12
     },
     {
-      "epoch": 2.6,
-      "grad_norm": 0.31048351526260376,
-      "learning_rate": 0.0001068242413364671,
-      "loss": 0.4072,
+      "epoch": 1.3,
+      "grad_norm": 0.3829534947872162,
+      "learning_rate": 0.0001784799385278661,
+      "loss": 0.5147,
       "step": 13
     },
     {
-      "epoch": 2.8,
-      "grad_norm": 0.3141072988510132,
-      "learning_rate": 9.317575866353292e-05,
-      "loss": 0.4244,
+      "epoch": 1.4,
+      "grad_norm": 0.316731721162796,
+      "learning_rate": 0.00017416521056479577,
+      "loss": 0.4958,
       "step": 14
     },
     {
-      "epoch": 3.0,
-      "grad_norm": 0.31501471996307373,
-      "learning_rate": 7.965439869473664e-05,
-      "loss": 0.4128,
+      "epoch": 1.5,
+      "grad_norm": 0.29609623551368713,
+      "learning_rate": 0.00016951924276746425,
+      "loss": 0.491,
       "step": 15
     },
     {
-      "epoch": 3.2,
-      "grad_norm": 0.2909337878227234,
-      "learning_rate": 6.651203878290139e-05,
-      "loss": 0.3821,
+      "epoch": 1.6,
+      "grad_norm": 0.29552099108695984,
+      "learning_rate": 0.00016456278515588024,
+      "loss": 0.4543,
       "step": 16
     },
     {
-      "epoch": 3.4,
-      "grad_norm": 0.26387819647789,
-      "learning_rate": 5.399349622688479e-05,
-      "loss": 0.3776,
+      "epoch": 1.7,
+      "grad_norm": 0.2840826213359833,
+      "learning_rate": 0.00015931797447293552,
+      "loss": 0.4774,
       "step": 17
     },
     {
-      "epoch": 3.6,
-      "grad_norm": 0.26343247294425964,
-      "learning_rate": 4.2331967788513295e-05,
-      "loss": 0.3619,
+      "epoch": 1.8,
+      "grad_norm": 0.2735576033592224,
+      "learning_rate": 0.00015380823531633729,
+      "loss": 0.4513,
       "step": 18
     },
     {
-      "epoch": 3.8,
-      "grad_norm": 0.26532021164894104,
-      "learning_rate": 3.174468567813461e-05,
-      "loss": 0.3715,
+      "epoch": 1.9,
+      "grad_norm": 0.2856349050998688,
+      "learning_rate": 0.00014805817551866838,
+      "loss": 0.4556,
       "step": 19
     },
     {
-      "epoch": 4.0,
-      "grad_norm": 0.27087175846099854,
-      "learning_rate": 2.242887092955801e-05,
-      "loss": 0.3683,
+      "epoch": 2.0,
+      "grad_norm": 0.2851189970970154,
+      "learning_rate": 0.0001420934762428335,
+      "loss": 0.4646,
       "step": 20
     },
     {
-      "epoch": 4.2,
-      "grad_norm": 0.25199756026268005,
-      "learning_rate": 1.4558059545351143e-05,
-      "loss": 0.3523,
+      "epoch": 2.1,
+      "grad_norm": 0.2509067952632904,
+      "learning_rate": 0.00013594077728375128,
+      "loss": 0.4102,
       "step": 21
     },
     {
-      "epoch": 4.4,
-      "grad_norm": 0.2494262307882309,
-      "learning_rate": 8.278869849454718e-06,
-      "loss": 0.333,
+      "epoch": 2.2,
+      "grad_norm": 0.2623592019081116,
+      "learning_rate": 0.00012962755808856342,
+      "loss": 0.378,
       "step": 22
     },
     {
-      "epoch": 4.6,
-      "grad_norm": 0.25421398878097534,
-      "learning_rate": 3.7082712652200867e-06,
-      "loss": 0.3289,
+      "epoch": 2.3,
+      "grad_norm": 0.267648845911026,
+      "learning_rate": 0.00012318201502675285,
+      "loss": 0.4115,
       "step": 23
     },
     {
-      "epoch": 4.8,
-      "grad_norm": 0.26278430223464966,
-      "learning_rate": 9.314053963669245e-07,
-      "loss": 0.3438,
+      "epoch": 2.4,
+      "grad_norm": 0.27410486340522766,
+      "learning_rate": 0.00011663293545831302,
+      "loss": 0.389,
       "step": 24
     },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.2685711979866028,
+      "learning_rate": 0.00011000956916240985,
+      "loss": 0.3922,
+      "step": 25
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.2666301727294922,
+      "learning_rate": 0.00010334149770076747,
+      "loss": 0.3749,
+      "step": 26
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.3079356551170349,
+      "learning_rate": 9.665850229923258e-05,
+      "loss": 0.3486,
+      "step": 27
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.28150758147239685,
+      "learning_rate": 8.999043083759017e-05,
+      "loss": 0.3606,
+      "step": 28
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.28707581758499146,
+      "learning_rate": 8.336706454168701e-05,
+      "loss": 0.3642,
+      "step": 29
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.2959519028663635,
+      "learning_rate": 7.681798497324716e-05,
+      "loss": 0.3624,
+      "step": 30
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 0.26570671796798706,
+      "learning_rate": 7.037244191143661e-05,
+      "loss": 0.3231,
+      "step": 31
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.2674141526222229,
+      "learning_rate": 6.405922271624874e-05,
+      "loss": 0.3298,
+      "step": 32
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 0.2791026532649994,
+      "learning_rate": 5.790652375716652e-05,
+      "loss": 0.3276,
+      "step": 33
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.26298171281814575,
+      "learning_rate": 5.1941824481331626e-05,
+      "loss": 0.3048,
+      "step": 34
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.269193559885025,
+      "learning_rate": 4.6191764683662744e-05,
+      "loss": 0.3216,
+      "step": 35
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.26902416348457336,
+      "learning_rate": 4.0682025527064486e-05,
+      "loss": 0.2933,
+      "step": 36
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 0.26919540762901306,
+      "learning_rate": 3.543721484411976e-05,
+      "loss": 0.3138,
+      "step": 37
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.27092471718788147,
+      "learning_rate": 3.0480757232535772e-05,
+      "loss": 0.3184,
+      "step": 38
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 0.26903486251831055,
+      "learning_rate": 2.5834789435204243e-05,
+      "loss": 0.3148,
+      "step": 39
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.28141602873802185,
+      "learning_rate": 2.1520061472133902e-05,
+      "loss": 0.3125,
+      "step": 40
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 0.25863194465637207,
+      "learning_rate": 1.7555843965823992e-05,
+      "loss": 0.2954,
+      "step": 41
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 0.2740136682987213,
+      "learning_rate": 1.3959842073986085e-05,
+      "loss": 0.2992,
+      "step": 42
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.2652461528778076,
+      "learning_rate": 1.0748116414011888e-05,
+      "loss": 0.2925,
+      "step": 43
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.278138667345047,
+      "learning_rate": 7.935011332357112e-06,
+      "loss": 0.2816,
+      "step": 44
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.2779091000556946,
+      "learning_rate": 5.533090839208133e-06,
+      "loss": 0.2726,
+      "step": 45
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 0.2674082815647125,
+      "learning_rate": 3.5530824945623542e-06,
+      "loss": 0.2836,
+      "step": 46
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 0.2792351245880127,
+      "learning_rate": 2.003829496341325e-06,
+      "loss": 0.2849,
+      "step": 47
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.2667449414730072,
+      "learning_rate": 8.922511845219971e-07,
+      "loss": 0.2929,
+      "step": 48
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 0.27168118953704834,
+      "learning_rate": 2.2331213768468363e-07,
+      "loss": 0.2739,
+      "step": 49
+    },
     {
       "epoch": 5.0,
-      "grad_norm": 0.2513781785964966,
+      "grad_norm": 0.27336716651916504,
       "learning_rate": 0.0,
-      "loss": 0.379,
-      "step": 25
+      "loss": 0.2936,
+      "step": 50
     }
   ],
   "logging_steps": 1,
-  "max_steps": 25,
+  "max_steps": 50,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 5,
   "save_steps": 500,
@@ -201,7 +376,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.082693846592717e+16,
+  "total_flos": 6.031379393052672e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bdb67f7e08d706c1aba378a44137116de683468a035998cac2d26dc2c2a5fd26
+oid sha256:2f9069aab4af513f470da5848e8771d43df41e0e1100cb7715f6f428cceb308b
 size 5240