DonJoey committed
Commit 6ae2063 (1 parent: 4c995f3)

Model save

README.md CHANGED
@@ -1,5 +1,4 @@
  ---
- library_name: transformers
  tags:
  - trl
  - sft
@@ -18,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model was trained from scratch on the generator dataset.
  It achieves the following results on the evaluation set:
- - Loss: 1.0593
+ - Loss: 1.0547
 
  ## Model description
 
@@ -55,14 +54,14 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:------:|:----:|:---------------:|
- | 1.3697 | 0.9389 | 12 | 1.2250 |
- | 1.0231 | 1.9560 | 25 | 1.0662 |
- | 0.901 | 2.8166 | 36 | 1.0593 |
+ | 1.4332 | 0.9505 | 12 | 1.2468 |
+ | 1.0421 | 1.9802 | 25 | 1.0650 |
+ | 0.9193 | 2.8515 | 36 | 1.0547 |
 
 
  ### Framework versions
 
- - Transformers 4.44.2
+ - Transformers 4.42.4
  - Pytorch 2.4.1+cu121
- - Datasets 2.21.0
+ - Datasets 2.20.0
  - Tokenizers 0.19.1
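
The model card above pins the framework versions (Transformers 4.42.4, Pytorch 2.4.1+cu121) and reports a final eval loss of 1.0547. A minimal sketch of loading the sharded safetensors checkpoint from this commit under those versions could look like the following; the repository id and the dtype are assumptions, since neither appears in the diff.

```python
# Minimal loading sketch under the versions listed in the model card
# (transformers 4.42.4, torch 2.4.1). The repo id and dtype are
# placeholders/assumptions, not stated anywhere in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "DonJoey/<model-name>"  # hypothetical repository id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # assumption: training dtype is not recorded in the card
    device_map="auto",
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```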
all_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 2.8166259168704157,
+ "epoch": 2.8514851485148514,
  "total_flos": 120497991843840.0,
- "train_loss": 1.135702931218677,
- "train_runtime": 11898.7844,
+ "train_loss": 1.160024169418547,
+ "train_runtime": 9224.6118,
  "train_samples": 49800,
- "train_samples_per_second": 3.299,
- "train_steps_per_second": 0.003
+ "train_samples_per_second": 4.198,
+ "train_steps_per_second": 0.004
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
- "transformers_version": "4.44.2"
+ "transformers_version": "4.42.4"
  }
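
The generation_config.json change only re-pins transformers_version; the functional fields (bos_token_id=1, eos_token_id=2) are unchanged. As a small sketch, the same settings can be reproduced or inspected with transformers.GenerationConfig; the repo id in the comment is a placeholder.

```python
# Sketch: the generation settings carried by this file, expressed via
# transformers.GenerationConfig. Only the values visible in the diff are set.
from transformers import GenerationConfig

gen_config = GenerationConfig(bos_token_id=1, eos_token_id=2)
print(gen_config.to_json_string())

# Once the repository is available, the committed file can be loaded directly:
# gen_config = GenerationConfig.from_pretrained("DonJoey/<model-name>")  # placeholder id
```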
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce7d89f4744a014e2bfbeb450afe7ff389ba2708f0b4f6adb0c84831ebba7bb2
+ oid sha256:1b48679daa36443b5ff8f888d574e8252a4a5751270673ad02fc4f60703a0a6d
  size 4949453792
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:22237e24f10cddb852a9b337a3ffae56e4dfe3fe4837650ea0b7f1725daa5f08
+ oid sha256:249978ba656652f2370256a901f63dcdf5bae38a75e510bc953005243e05cc37
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:48dbf98abfd3926d37e2a5a966ec2d135ca7c774d82c3aec9e5c38c39a9df30c
+ oid sha256:4d637cd17384a9c92db142d4fb9e1d05dfb5e6d7e6bb895474f685ab708e4205
  size 4546807800
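
Each safetensors entry above is a Git LFS pointer: only the sha256 oid changed while the byte size stayed the same, i.e. the shards carry new weight values in the same layout. A sketch of checking a locally downloaded shard against its pointer, assuming the file has already been fetched:

```python
# Sketch: verify a downloaded shard against the sha256 oid in its LFS pointer.
# Assumes model-00001-of-00003.safetensors is present in the working directory.
import hashlib

expected = "1b48679daa36443b5ff8f888d574e8252a4a5751270673ad02fc4f60703a0a6d"

digest = hashlib.sha256()
with open("model-00001-of-00003.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest() == expected)  # True when the shard matches the pointer
```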
runs/Sep12_14-33-22_cs420n/events.out.tfevents.1726122939.cs420n.2362893.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1608a2cfeb31f0abc670dc88846f164d1a873be6047c2847f575ab7edc163740
- size 7653
+ oid sha256:e83e579f50df8b8cbf38e93ef1ea62323815a3dede5459ad3b1ea79d3bd7de21
+ size 8267
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 2.8166259168704157,
+ "epoch": 2.8514851485148514,
  "total_flos": 120497991843840.0,
- "train_loss": 1.135702931218677,
- "train_runtime": 11898.7844,
+ "train_loss": 1.160024169418547,
+ "train_runtime": 9224.6118,
  "train_samples": 49800,
- "train_samples_per_second": 3.299,
- "train_steps_per_second": 0.003
+ "train_samples_per_second": 4.198,
+ "train_steps_per_second": 0.004
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 2.8166259168704157,
+ "epoch": 2.8514851485148514,
  "eval_steps": 500,
  "global_step": 36,
  "is_hyper_param_search": false,
@@ -9,93 +9,93 @@
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.07823960880195599,
- "grad_norm": 24.32354215756571,
+ "epoch": 0.07920792079207921,
+ "grad_norm": 25.20428267668325,
  "learning_rate": 5e-06,
- "loss": 1.579,
+ "loss": 1.5917,
  "step": 1
  },
  {
- "epoch": 0.39119804400978,
- "grad_norm": 49.96314997488065,
+ "epoch": 0.39603960396039606,
+ "grad_norm": 36.917025665063875,
  "learning_rate": 1.995184726672197e-05,
- "loss": 1.4677,
+ "loss": 1.4728,
  "step": 5
  },
  {
- "epoch": 0.78239608801956,
- "grad_norm": 4.9439904702205855,
+ "epoch": 0.7920792079207921,
+ "grad_norm": 8.149660114746355,
  "learning_rate": 1.8314696123025456e-05,
- "loss": 1.3697,
+ "loss": 1.4332,
  "step": 10
  },
  {
- "epoch": 0.9388753056234719,
- "eval_loss": 1.2249956130981445,
- "eval_runtime": 6.2806,
- "eval_samples_per_second": 8.439,
- "eval_steps_per_second": 0.318,
+ "epoch": 0.9504950495049505,
+ "eval_loss": 1.2468267679214478,
+ "eval_runtime": 5.7996,
+ "eval_samples_per_second": 8.966,
+ "eval_steps_per_second": 0.345,
  "step": 12
  },
  {
- "epoch": 1.17359413202934,
- "grad_norm": 5.393870573942289,
+ "epoch": 1.188118811881188,
+ "grad_norm": 7.683983380513048,
  "learning_rate": 1.4713967368259981e-05,
- "loss": 1.2118,
+ "loss": 1.2317,
  "step": 15
  },
  {
- "epoch": 1.56479217603912,
- "grad_norm": 1.7797044387765701,
+ "epoch": 1.5841584158415842,
+ "grad_norm": 2.458006119450546,
  "learning_rate": 1e-05,
- "loss": 1.0764,
+ "loss": 1.1031,
  "step": 20
  },
  {
- "epoch": 1.9559902200488999,
- "grad_norm": 1.307540863093363,
+ "epoch": 1.9801980198019802,
+ "grad_norm": 1.4615528467429235,
  "learning_rate": 5.286032631740023e-06,
- "loss": 1.0231,
+ "loss": 1.0421,
  "step": 25
  },
  {
- "epoch": 1.9559902200488999,
- "eval_loss": 1.066247820854187,
- "eval_runtime": 8.3677,
- "eval_samples_per_second": 6.334,
- "eval_steps_per_second": 0.239,
+ "epoch": 1.9801980198019802,
+ "eval_loss": 1.065010666847229,
+ "eval_runtime": 5.6499,
+ "eval_samples_per_second": 9.204,
+ "eval_steps_per_second": 0.354,
  "step": 25
  },
  {
- "epoch": 2.34718826405868,
- "grad_norm": 1.1252690680547968,
+ "epoch": 2.376237623762376,
+ "grad_norm": 1.251106845075615,
  "learning_rate": 1.6853038769745466e-06,
- "loss": 0.93,
+ "loss": 0.9444,
  "step": 30
  },
  {
- "epoch": 2.73838630806846,
- "grad_norm": 0.7799666146844121,
+ "epoch": 2.772277227722772,
+ "grad_norm": 0.8336224848562745,
  "learning_rate": 4.815273327803183e-08,
- "loss": 0.901,
+ "loss": 0.9193,
  "step": 35
  },
  {
- "epoch": 2.8166259168704157,
- "eval_loss": 1.0592604875564575,
- "eval_runtime": 8.1855,
- "eval_samples_per_second": 6.475,
- "eval_steps_per_second": 0.244,
+ "epoch": 2.8514851485148514,
+ "eval_loss": 1.0546845197677612,
+ "eval_runtime": 5.7189,
+ "eval_samples_per_second": 9.093,
+ "eval_steps_per_second": 0.35,
  "step": 36
  },
  {
- "epoch": 2.8166259168704157,
+ "epoch": 2.8514851485148514,
  "step": 36,
  "total_flos": 120497991843840.0,
- "train_loss": 1.135702931218677,
- "train_runtime": 11898.7844,
- "train_samples_per_second": 3.299,
- "train_steps_per_second": 0.003
+ "train_loss": 1.160024169418547,
+ "train_runtime": 9224.6118,
+ "train_samples_per_second": 4.198,
+ "train_steps_per_second": 0.004
  }
  ],
  "logging_steps": 5,