ChenWu98 committed on
Commit
7dca212
1 Parent(s): 2c32ede

Model save

Browse files
Files changed (5) hide show
  1. README.md +5 -11
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +81 -31
README.md CHANGED
@@ -2,16 +2,9 @@
2
  license: mit
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
- - trl
10
- - sft
11
- - generated_from_trainer
12
- datasets:
13
- - ChenWu98/skills_metaphor_chat
14
- - ChenWu98/skills_red_herring_chat
15
  base_model: HuggingFaceH4/zephyr-7b-beta
16
  model-index:
17
  - name: skills_metaphor_chat-skills_red_herring_chat-lora
@@ -23,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # skills_metaphor_chat-skills_red_herring_chat-lora
25
 
26
- This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_metaphor_chat and the ChenWu98/skills_red_herring_chat datasets.
27
  It achieves the following results on the evaluation set:
28
- - Loss: 0.2247
29
 
30
  ## Model description
31
 
@@ -54,13 +47,14 @@ The following hyperparameters were used during training:
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: cosine
56
  - lr_scheduler_warmup_ratio: 0.1
57
- - num_epochs: 1.0
58
 
59
  ### Training results
60
 
61
  | Training Loss | Epoch | Step | Validation Loss |
62
  |:-------------:|:-----:|:----:|:---------------:|
63
- | 0.203 | 0.99 | 37 | 0.2247 |
 
64
 
65
 
66
  ### Framework versions
 
2
  license: mit
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
 
 
 
 
8
  base_model: HuggingFaceH4/zephyr-7b-beta
9
  model-index:
10
  - name: skills_metaphor_chat-skills_red_herring_chat-lora
 
16
 
17
  # skills_metaphor_chat-skills_red_herring_chat-lora
18
 
19
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_metaphor_chat and the ChenWu98/skills_red_herring_chat datasets.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2123
22
 
23
  ## Model description
24
 
 
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 2.0
51
 
52
  ### Training results
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.199 | 0.99 | 37 | 0.2184 |
57
+ | 0.1668 | 1.97 | 74 | 0.2123 |
58
 
59
 
60
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.99,
3
- "eval_loss": 0.22467635571956635,
4
- "eval_runtime": 8.1549,
5
  "eval_samples": 200,
6
- "eval_samples_per_second": 24.525,
7
- "eval_steps_per_second": 3.066,
8
- "train_loss": 0.5330296472923176,
9
- "train_runtime": 203.4112,
10
  "train_samples": 600,
11
- "train_samples_per_second": 2.95,
12
- "train_steps_per_second": 0.182
13
  }
 
1
  {
2
+ "epoch": 1.97,
3
+ "eval_loss": 0.21233995258808136,
4
+ "eval_runtime": 8.1232,
5
  "eval_samples": 200,
6
+ "eval_samples_per_second": 24.621,
7
+ "eval_steps_per_second": 3.078,
8
+ "train_loss": 0.39267757534980774,
9
+ "train_runtime": 398.9638,
10
  "train_samples": 600,
11
+ "train_samples_per_second": 3.008,
12
+ "train_steps_per_second": 0.185
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.99,
3
- "eval_loss": 0.22467635571956635,
4
- "eval_runtime": 8.1549,
5
  "eval_samples": 200,
6
- "eval_samples_per_second": 24.525,
7
- "eval_steps_per_second": 3.066
8
  }
 
1
  {
2
+ "epoch": 1.97,
3
+ "eval_loss": 0.21233995258808136,
4
+ "eval_runtime": 8.1232,
5
  "eval_samples": 200,
6
+ "eval_samples_per_second": 24.621,
7
+ "eval_steps_per_second": 3.078
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.99,
3
- "train_loss": 0.5330296472923176,
4
- "train_runtime": 203.4112,
5
  "train_samples": 600,
6
- "train_samples_per_second": 2.95,
7
- "train_steps_per_second": 0.182
8
  }
 
1
  {
2
+ "epoch": 1.97,
3
+ "train_loss": 0.39267757534980774,
4
+ "train_runtime": 398.9638,
5
  "train_samples": 600,
6
+ "train_samples_per_second": 3.008,
7
+ "train_steps_per_second": 0.185
8
  }
trainer_state.json CHANGED
@@ -1,85 +1,135 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9866666666666667,
5
  "eval_steps": 500,
6
- "global_step": 37,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "learning_rate": 5e-05,
14
  "loss": 2.2275,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.13,
19
- "learning_rate": 0.00019954719225730847,
20
- "loss": 1.9848,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.27,
25
- "learning_rate": 0.00018412535328311814,
26
- "loss": 0.6529,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.4,
31
- "learning_rate": 0.00015000000000000001,
32
- "loss": 0.2662,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.53,
37
- "learning_rate": 0.00010475819158237425,
38
- "loss": 0.238,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.67,
43
- "learning_rate": 5.845849869981137e-05,
44
- "loss": 0.2332,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.8,
49
- "learning_rate": 2.139469052572127e-05,
50
- "loss": 0.2287,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.93,
55
- "learning_rate": 1.8071302737293295e-06,
56
- "loss": 0.203,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.99,
61
- "eval_loss": 0.22467635571956635,
62
- "eval_runtime": 9.0559,
63
- "eval_samples_per_second": 22.085,
64
- "eval_steps_per_second": 2.761,
65
  "step": 37
66
  },
67
  {
68
- "epoch": 0.99,
69
- "step": 37,
70
- "total_flos": 20586009395200.0,
71
- "train_loss": 0.5330296472923176,
72
- "train_runtime": 203.4112,
73
- "train_samples_per_second": 2.95,
74
- "train_steps_per_second": 0.182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  }
76
  ],
77
  "logging_steps": 5,
78
- "max_steps": 37,
79
  "num_input_tokens_seen": 0,
80
- "num_train_epochs": 1,
81
  "save_steps": 500,
82
- "total_flos": 20586009395200.0,
83
  "train_batch_size": 4,
84
  "trial_name": null,
85
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9733333333333334,
5
  "eval_steps": 500,
6
+ "global_step": 74,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "learning_rate": 2.5e-05,
14
  "loss": 2.2275,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.13,
19
+ "learning_rate": 0.000125,
20
+ "loss": 2.1443,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.27,
25
+ "learning_rate": 0.00019954719225730847,
26
+ "loss": 1.0226,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.4,
31
+ "learning_rate": 0.00019450008187146684,
32
+ "loss": 0.31,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.53,
37
+ "learning_rate": 0.00018412535328311814,
38
+ "loss": 0.2456,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.67,
43
+ "learning_rate": 0.00016900790114821122,
44
+ "loss": 0.2362,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.8,
49
+ "learning_rate": 0.00015000000000000001,
50
+ "loss": 0.2269,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.93,
55
+ "learning_rate": 0.00012817325568414297,
56
+ "loss": 0.199,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.99,
61
+ "eval_loss": 0.21835945546627045,
62
+ "eval_runtime": 8.9027,
63
+ "eval_samples_per_second": 22.465,
64
+ "eval_steps_per_second": 2.808,
65
  "step": 37
66
  },
67
  {
68
+ "epoch": 1.07,
69
+ "learning_rate": 0.00010475819158237425,
70
+ "loss": 0.1995,
71
+ "step": 40
72
+ },
73
+ {
74
+ "epoch": 1.2,
75
+ "learning_rate": 8.107487556395901e-05,
76
+ "loss": 0.1853,
77
+ "step": 45
78
+ },
79
+ {
80
+ "epoch": 1.33,
81
+ "learning_rate": 5.845849869981137e-05,
82
+ "loss": 0.1781,
83
+ "step": 50
84
+ },
85
+ {
86
+ "epoch": 1.47,
87
+ "learning_rate": 3.8184101377939476e-05,
88
+ "loss": 0.1821,
89
+ "step": 55
90
+ },
91
+ {
92
+ "epoch": 1.6,
93
+ "learning_rate": 2.139469052572127e-05,
94
+ "loss": 0.1811,
95
+ "step": 60
96
+ },
97
+ {
98
+ "epoch": 1.73,
99
+ "learning_rate": 9.036800464548157e-06,
100
+ "loss": 0.1728,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 1.87,
105
+ "learning_rate": 1.8071302737293295e-06,
106
+ "loss": 0.1668,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 1.97,
111
+ "eval_loss": 0.21233995258808136,
112
+ "eval_runtime": 8.0137,
113
+ "eval_samples_per_second": 24.957,
114
+ "eval_steps_per_second": 3.12,
115
+ "step": 74
116
+ },
117
+ {
118
+ "epoch": 1.97,
119
+ "step": 74,
120
+ "total_flos": 41279761350656.0,
121
+ "train_loss": 0.39267757534980774,
122
+ "train_runtime": 398.9638,
123
+ "train_samples_per_second": 3.008,
124
+ "train_steps_per_second": 0.185
125
  }
126
  ],
127
  "logging_steps": 5,
128
+ "max_steps": 74,
129
  "num_input_tokens_seen": 0,
130
+ "num_train_epochs": 2,
131
  "save_steps": 500,
132
+ "total_flos": 41279761350656.0,
133
  "train_batch_size": 4,
134
  "trial_name": null,
135
  "trial_params": null