ChenWu98 committed on
Commit
58185a0
1 Parent(s): 11e49e4

Model save

Browse files
Files changed (5) hide show
  1. README.md +7 -10
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +79 -19
README.md CHANGED
@@ -1,15 +1,9 @@
1
  ---
2
  library_name: peft
3
  tags:
4
- - alignment-handbook
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
- - trl
9
- - sft
10
- - generated_from_trainer
11
- datasets:
12
- - ChenWu98/skills_metaphor_chat
13
  base_model: merged_ckpts/skills_red_herring_chat-lora
14
  model-index:
15
  - name: skills_red_herring_chat-then-skills_metaphor_chat-lora
@@ -21,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # skills_red_herring_chat-then-skills_metaphor_chat-lora
23
 
24
- This model is a fine-tuned version of [merged_ckpts/skills_red_herring_chat-lora](https://huggingface.co/merged_ckpts/skills_red_herring_chat-lora) on the ChenWu98/skills_metaphor_chat dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 0.2417
27
 
28
  ## Model description
29
 
@@ -52,13 +46,16 @@ The following hyperparameters were used during training:
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 1.0
56
 
57
  ### Training results
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.408 | 0.96 | 9 | 0.2417 |
 
 
 
62
 
63
 
64
  ### Framework versions
 
1
  ---
2
  library_name: peft
3
  tags:
 
4
  - trl
5
  - sft
6
  - generated_from_trainer
 
 
 
 
 
7
  base_model: merged_ckpts/skills_red_herring_chat-lora
8
  model-index:
9
  - name: skills_red_herring_chat-then-skills_metaphor_chat-lora
 
15
 
16
  # skills_red_herring_chat-then-skills_metaphor_chat-lora
17
 
18
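+ This model is a fine-tuned version of [merged_ckpts/skills_red_herring_chat-lora](https://huggingface.co/merged_ckpts/skills_red_herring_chat-lora) on the ChenWu98/skills_metaphor_chat dataset.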
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.2034
21
 
22
  ## Model description
23
 
 
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 4.0
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss |
54
  |:-------------:|:-----:|:----:|:---------------:|
55
+ | 0.4913 | 0.96 | 9 | 0.2320 |
56
+ | 0.2051 | 1.92 | 18 | 0.2080 |
57
+ | 0.1665 | 2.99 | 28 | 0.2038 |
58
+ | 0.1519 | 3.84 | 36 | 0.2034 |
59
 
60
 
61
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.96,
3
- "eval_loss": 0.24174152314662933,
4
- "eval_runtime": 5.1895,
5
  "eval_samples": 100,
6
- "eval_samples_per_second": 19.27,
7
- "eval_steps_per_second": 2.505,
8
- "train_loss": 0.3598402738571167,
9
- "train_runtime": 147.976,
10
  "train_samples": 300,
11
- "train_samples_per_second": 2.027,
12
- "train_steps_per_second": 0.061
13
  }
 
1
  {
2
+ "epoch": 3.84,
3
+ "eval_loss": 0.2034250646829605,
4
+ "eval_runtime": 6.2347,
5
  "eval_samples": 100,
6
+ "eval_samples_per_second": 16.039,
7
+ "eval_steps_per_second": 2.085,
8
+ "train_loss": 0.23011421908934912,
9
+ "train_runtime": 528.713,
10
  "train_samples": 300,
11
+ "train_samples_per_second": 2.27,
12
+ "train_steps_per_second": 0.068
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.96,
3
- "eval_loss": 0.24174152314662933,
4
- "eval_runtime": 5.1895,
5
  "eval_samples": 100,
6
- "eval_samples_per_second": 19.27,
7
- "eval_steps_per_second": 2.505
8
  }
 
1
  {
2
+ "epoch": 3.84,
3
+ "eval_loss": 0.2034250646829605,
4
+ "eval_runtime": 6.2347,
5
  "eval_samples": 100,
6
+ "eval_samples_per_second": 16.039,
7
+ "eval_steps_per_second": 2.085
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.96,
3
- "train_loss": 0.3598402738571167,
4
- "train_runtime": 147.976,
5
  "train_samples": 300,
6
- "train_samples_per_second": 2.027,
7
- "train_steps_per_second": 0.061
8
  }
 
1
  {
2
+ "epoch": 3.84,
3
+ "train_loss": 0.23011421908934912,
4
+ "train_runtime": 528.713,
5
  "train_samples": 300,
6
+ "train_samples_per_second": 2.27,
7
+ "train_steps_per_second": 0.068
8
  }
trainer_state.json CHANGED
@@ -1,49 +1,109 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.96,
5
  "eval_steps": 500,
6
- "global_step": 9,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.11,
13
- "learning_rate": 0.0002,
14
  "loss": 0.6469,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.53,
19
- "learning_rate": 0.0001,
20
- "loss": 0.408,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.96,
25
- "eval_loss": 0.24174152314662933,
26
- "eval_runtime": 6.0031,
27
- "eval_samples_per_second": 16.658,
28
- "eval_steps_per_second": 2.166,
29
  "step": 9
30
  },
31
  {
32
- "epoch": 0.96,
33
- "step": 9,
34
- "total_flos": 9507448946688.0,
35
- "train_loss": 0.3598402738571167,
36
- "train_runtime": 147.976,
37
- "train_samples_per_second": 2.027,
38
- "train_steps_per_second": 0.061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  }
40
  ],
41
  "logging_steps": 5,
42
- "max_steps": 9,
43
  "num_input_tokens_seen": 0,
44
- "num_train_epochs": 1,
45
  "save_steps": 500,
46
- "total_flos": 9507448946688.0,
47
  "train_batch_size": 4,
48
  "trial_name": null,
49
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.84,
5
  "eval_steps": 500,
6
+ "global_step": 36,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.11,
13
+ "learning_rate": 5e-05,
14
  "loss": 0.6469,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.53,
19
+ "learning_rate": 0.0001995184726672197,
20
+ "loss": 0.4913,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.96,
25
+ "eval_loss": 0.23202306032180786,
26
+ "eval_runtime": 6.7844,
27
+ "eval_samples_per_second": 14.74,
28
+ "eval_steps_per_second": 1.916,
29
  "step": 9
30
  },
31
  {
32
+ "epoch": 1.07,
33
+ "learning_rate": 0.00018314696123025454,
34
+ "loss": 0.2397,
35
+ "step": 10
36
+ },
37
+ {
38
+ "epoch": 1.6,
39
+ "learning_rate": 0.0001471396736825998,
40
+ "loss": 0.2051,
41
+ "step": 15
42
+ },
43
+ {
44
+ "epoch": 1.92,
45
+ "eval_loss": 0.20802158117294312,
46
+ "eval_runtime": 4.7703,
47
+ "eval_samples_per_second": 20.963,
48
+ "eval_steps_per_second": 2.725,
49
+ "step": 18
50
+ },
51
+ {
52
+ "epoch": 2.13,
53
+ "learning_rate": 0.0001,
54
+ "loss": 0.1798,
55
+ "step": 20
56
+ },
57
+ {
58
+ "epoch": 2.67,
59
+ "learning_rate": 5.286032631740023e-05,
60
+ "loss": 0.1665,
61
+ "step": 25
62
+ },
63
+ {
64
+ "epoch": 2.99,
65
+ "eval_loss": 0.20379072427749634,
66
+ "eval_runtime": 5.6795,
67
+ "eval_samples_per_second": 17.607,
68
+ "eval_steps_per_second": 2.289,
69
+ "step": 28
70
+ },
71
+ {
72
+ "epoch": 3.2,
73
+ "learning_rate": 1.6853038769745467e-05,
74
+ "loss": 0.1613,
75
+ "step": 30
76
+ },
77
+ {
78
+ "epoch": 3.73,
79
+ "learning_rate": 4.815273327803182e-07,
80
+ "loss": 0.1519,
81
+ "step": 35
82
+ },
83
+ {
84
+ "epoch": 3.84,
85
+ "eval_loss": 0.2034250646829605,
86
+ "eval_runtime": 5.6541,
87
+ "eval_samples_per_second": 17.686,
88
+ "eval_steps_per_second": 2.299,
89
+ "step": 36
90
+ },
91
+ {
92
+ "epoch": 3.84,
93
+ "step": 36,
94
+ "total_flos": 38498471247872.0,
95
+ "train_loss": 0.23011421908934912,
96
+ "train_runtime": 528.713,
97
+ "train_samples_per_second": 2.27,
98
+ "train_steps_per_second": 0.068
99
  }
100
  ],
101
  "logging_steps": 5,
102
+ "max_steps": 36,
103
  "num_input_tokens_seen": 0,
104
+ "num_train_epochs": 4,
105
  "save_steps": 500,
106
+ "total_flos": 38498471247872.0,
107
  "train_batch_size": 4,
108
  "trial_name": null,
109
  "trial_params": null