ChenWu98 committed on
Commit
db289f4
·
verified ·
1 Parent(s): bec7e65

Model save

Browse files
Files changed (5) hide show
  1. README.md +7 -15
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +34 -116
README.md CHANGED
@@ -2,15 +2,9 @@
2
  license: mit
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
- - trl
10
- - sft
11
- - generated_from_trainer
12
- datasets:
13
- - ChenWu98/skills_red_herring_chat
14
  base_model: HuggingFaceH4/zephyr-7b-beta
15
  model-index:
16
  - name: skills_red_herring_chat-lora
@@ -22,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # skills_red_herring_chat-lora
24
 
25
- This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_red_herring_chat dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.2122
28
 
29
  ## Model description
30
 
@@ -48,21 +42,19 @@ The following hyperparameters were used during training:
48
  - eval_batch_size: 8
49
  - seed: 42
50
  - distributed_type: multi-GPU
51
- - gradient_accumulation_steps: 4
52
- - total_train_batch_size: 16
53
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
  - lr_scheduler_type: cosine
55
  - lr_scheduler_warmup_ratio: 0.1
56
- - num_epochs: 4.0
57
 
58
  ### Training results
59
 
60
  | Training Loss | Epoch | Step | Validation Loss |
61
  |:-------------:|:-----:|:----:|:---------------:|
62
- | 0.3285 | 0.96 | 18 | 0.2505 |
63
- | 0.1944 | 1.97 | 37 | 0.2189 |
64
- | 0.1767 | 2.99 | 56 | 0.2127 |
65
- | 0.1591 | 3.84 | 72 | 0.2122 |
66
 
67
 
68
  ### Framework versions
 
2
  license: mit
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
 
 
 
8
  base_model: HuggingFaceH4/zephyr-7b-beta
9
  model-index:
10
  - name: skills_red_herring_chat-lora
 
16
 
17
  # skills_red_herring_chat-lora
18
 
19
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2756
22
 
23
  ## Model description
24
 
 
42
  - eval_batch_size: 8
43
  - seed: 42
44
  - distributed_type: multi-GPU
45
+ - gradient_accumulation_steps: 8
46
+ - total_train_batch_size: 32
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 2.0
51
 
52
  ### Training results
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
+ | 1.8942 | 0.96 | 9 | 0.3455 |
57
+ | 0.2839 | 1.92 | 18 | 0.2756 |
 
 
58
 
59
 
60
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 3.84,
3
- "eval_loss": 0.21221186220645905,
4
- "eval_runtime": 4.0792,
5
  "eval_samples": 100,
6
- "eval_samples_per_second": 24.515,
7
- "eval_steps_per_second": 3.187,
8
- "train_loss": 0.4049788423710399,
9
- "train_runtime": 399.0811,
10
  "train_samples": 300,
11
- "train_samples_per_second": 3.007,
12
- "train_steps_per_second": 0.18
13
  }
 
1
  {
2
+ "epoch": 1.92,
3
+ "eval_loss": 0.2756038010120392,
4
+ "eval_runtime": 4.1878,
5
  "eval_samples": 100,
6
+ "eval_samples_per_second": 23.879,
7
+ "eval_steps_per_second": 3.104,
8
+ "train_loss": 0.8368253144952986,
9
+ "train_runtime": 204.6909,
10
  "train_samples": 300,
11
+ "train_samples_per_second": 2.931,
12
+ "train_steps_per_second": 0.088
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.84,
3
- "eval_loss": 0.21221186220645905,
4
- "eval_runtime": 4.0792,
5
  "eval_samples": 100,
6
- "eval_samples_per_second": 24.515,
7
- "eval_steps_per_second": 3.187
8
  }
 
1
  {
2
+ "epoch": 1.92,
3
+ "eval_loss": 0.2756038010120392,
4
+ "eval_runtime": 4.1878,
5
  "eval_samples": 100,
6
+ "eval_samples_per_second": 23.879,
7
+ "eval_steps_per_second": 3.104
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.84,
3
- "train_loss": 0.4049788423710399,
4
- "train_runtime": 399.0811,
5
  "train_samples": 300,
6
- "train_samples_per_second": 3.007,
7
- "train_steps_per_second": 0.18
8
  }
 
1
  {
2
+ "epoch": 1.92,
3
+ "train_loss": 0.8368253144952986,
4
+ "train_runtime": 204.6909,
5
  "train_samples": 300,
6
+ "train_samples_per_second": 2.931,
7
+ "train_steps_per_second": 0.088
8
  }
trainer_state.json CHANGED
@@ -1,151 +1,69 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.84,
5
  "eval_steps": 500,
6
- "global_step": 72,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.05,
13
- "learning_rate": 2.5e-05,
14
- "loss": 2.4616,
15
  "step": 1
16
  },
17
- {
18
- "epoch": 0.27,
19
- "learning_rate": 0.000125,
20
- "loss": 2.2926,
21
- "step": 5
22
- },
23
  {
24
  "epoch": 0.53,
25
- "learning_rate": 0.0001995184726672197,
26
- "loss": 1.0788,
27
- "step": 10
28
- },
29
- {
30
- "epoch": 0.8,
31
- "learning_rate": 0.00019415440651830208,
32
- "loss": 0.3285,
33
- "step": 15
34
  },
35
  {
36
  "epoch": 0.96,
37
- "eval_loss": 0.25046542286872864,
38
- "eval_runtime": 4.9708,
39
- "eval_samples_per_second": 20.118,
40
- "eval_steps_per_second": 2.615,
41
- "step": 18
42
  },
43
  {
44
  "epoch": 1.07,
45
- "learning_rate": 0.00018314696123025454,
46
- "loss": 0.2382,
47
- "step": 20
48
- },
49
- {
50
- "epoch": 1.33,
51
- "learning_rate": 0.00016715589548470185,
52
- "loss": 0.2235,
53
- "step": 25
54
- },
55
- {
56
- "epoch": 1.6,
57
- "learning_rate": 0.0001471396736825998,
58
- "loss": 0.2044,
59
- "step": 30
60
- },
61
- {
62
- "epoch": 1.87,
63
- "learning_rate": 0.0001242980179903264,
64
- "loss": 0.1944,
65
- "step": 35
66
- },
67
- {
68
- "epoch": 1.97,
69
- "eval_loss": 0.21891021728515625,
70
- "eval_runtime": 4.0841,
71
- "eval_samples_per_second": 24.485,
72
- "eval_steps_per_second": 3.183,
73
- "step": 37
74
- },
75
- {
76
- "epoch": 2.13,
77
  "learning_rate": 0.0001,
78
- "loss": 0.1881,
79
- "step": 40
80
- },
81
- {
82
- "epoch": 2.4,
83
- "learning_rate": 7.570198200967362e-05,
84
- "loss": 0.1768,
85
- "step": 45
86
- },
87
- {
88
- "epoch": 2.67,
89
- "learning_rate": 5.286032631740023e-05,
90
- "loss": 0.1628,
91
- "step": 50
92
- },
93
- {
94
- "epoch": 2.93,
95
- "learning_rate": 3.2844104515298155e-05,
96
- "loss": 0.1767,
97
- "step": 55
98
- },
99
- {
100
- "epoch": 2.99,
101
- "eval_loss": 0.2127072662115097,
102
- "eval_runtime": 4.0548,
103
- "eval_samples_per_second": 24.662,
104
- "eval_steps_per_second": 3.206,
105
- "step": 56
106
  },
107
  {
108
- "epoch": 3.2,
109
  "learning_rate": 1.6853038769745467e-05,
110
- "loss": 0.1663,
111
- "step": 60
112
- },
113
- {
114
- "epoch": 3.47,
115
- "learning_rate": 5.8455934816979305e-06,
116
- "loss": 0.1468,
117
- "step": 65
118
- },
119
- {
120
- "epoch": 3.73,
121
- "learning_rate": 4.815273327803182e-07,
122
- "loss": 0.1591,
123
- "step": 70
124
  },
125
  {
126
- "epoch": 3.84,
127
- "eval_loss": 0.21221186220645905,
128
- "eval_runtime": 4.0724,
129
- "eval_samples_per_second": 24.556,
130
- "eval_steps_per_second": 3.192,
131
- "step": 72
132
  },
133
  {
134
- "epoch": 3.84,
135
- "step": 72,
136
- "total_flos": 40669301473280.0,
137
- "train_loss": 0.4049788423710399,
138
- "train_runtime": 399.0811,
139
- "train_samples_per_second": 3.007,
140
- "train_steps_per_second": 0.18
141
  }
142
  ],
143
  "logging_steps": 5,
144
- "max_steps": 72,
145
  "num_input_tokens_seen": 0,
146
- "num_train_epochs": 4,
147
  "save_steps": 500,
148
- "total_flos": 40669301473280.0,
149
  "train_batch_size": 4,
150
  "trial_name": null,
151
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.92,
5
  "eval_steps": 500,
6
+ "global_step": 18,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.11,
13
+ "learning_rate": 0.0001,
14
+ "loss": 2.4834,
15
  "step": 1
16
  },
 
 
 
 
 
 
17
  {
18
  "epoch": 0.53,
19
+ "learning_rate": 0.00018314696123025454,
20
+ "loss": 1.8942,
21
+ "step": 5
 
 
 
 
 
 
22
  },
23
  {
24
  "epoch": 0.96,
25
+ "eval_loss": 0.3454779088497162,
26
+ "eval_runtime": 5.1335,
27
+ "eval_samples_per_second": 19.48,
28
+ "eval_steps_per_second": 2.532,
29
+ "step": 9
30
  },
31
  {
32
  "epoch": 1.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "learning_rate": 0.0001,
34
+ "loss": 0.5613,
35
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
  {
38
+ "epoch": 1.6,
39
  "learning_rate": 1.6853038769745467e-05,
40
+ "loss": 0.2839,
41
+ "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
42
  },
43
  {
44
+ "epoch": 1.92,
45
+ "eval_loss": 0.2756038010120392,
46
+ "eval_runtime": 4.1779,
47
+ "eval_samples_per_second": 23.936,
48
+ "eval_steps_per_second": 3.112,
49
+ "step": 18
50
  },
51
  {
52
+ "epoch": 1.92,
53
+ "step": 18,
54
+ "total_flos": 20252706734080.0,
55
+ "train_loss": 0.8368253144952986,
56
+ "train_runtime": 204.6909,
57
+ "train_samples_per_second": 2.931,
58
+ "train_steps_per_second": 0.088
59
  }
60
  ],
61
  "logging_steps": 5,
62
+ "max_steps": 18,
63
  "num_input_tokens_seen": 0,
64
+ "num_train_epochs": 2,
65
  "save_steps": 500,
66
+ "total_flos": 20252706734080.0,
67
  "train_batch_size": 4,
68
  "trial_name": null,
69
  "trial_params": null