justinwangx committed on
Commit
07b3e60
1 Parent(s): 10ff823

Model save

Browse files
README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.1
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: mistral-instruct-adv-robust-50-sft-lora
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # mistral-instruct-adv-robust-50-sft-lora
15
+
16
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on an unknown dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 0.8817
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 0.0003
38
+ - train_batch_size: 4
39
+ - eval_batch_size: 8
40
+ - seed: 42
41
+ - distributed_type: multi-GPU
42
+ - num_devices: 4
43
+ - gradient_accumulation_steps: 16
44
+ - total_train_batch_size: 256
45
+ - total_eval_batch_size: 32
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - num_epochs: 50
49
+
50
+ ### Training results
51
+
52
+ | Training Loss | Epoch | Step | Validation Loss |
53
+ |:-------------:|:-----:|:----:|:---------------:|
54
+ | 3.1318 | 0.12 | 1 | 2.8355 |
55
+ | 3.1318 | 1.12 | 2 | 2.6364 |
56
+ | 3.1318 | 2.12 | 3 | 2.4945 |
57
+ | 3.1318 | 3.12 | 4 | 2.5339 |
58
+ | 2.7386 | 4.12 | 5 | 2.3352 |
59
+ | 2.7386 | 5.12 | 6 | 2.2137 |
60
+ | 2.7386 | 6.12 | 7 | 2.1641 |
61
+ | 2.7386 | 7.12 | 8 | 2.1051 |
62
+ | 2.7386 | 8.12 | 9 | 2.0842 |
63
+ | 2.269 | 9.12 | 10 | 2.0479 |
64
+ | 2.269 | 10.12 | 11 | 1.9554 |
65
+ | 2.269 | 11.12 | 12 | 1.8555 |
66
+ | 2.269 | 12.12 | 13 | 1.7736 |
67
+ | 2.269 | 13.12 | 14 | 1.7906 |
68
+ | 1.9451 | 14.12 | 15 | 1.7737 |
69
+ | 1.9451 | 15.12 | 16 | 1.6677 |
70
+ | 1.9451 | 16.12 | 17 | 1.6411 |
71
+ | 1.9451 | 17.12 | 18 | 1.5739 |
72
+ | 1.9451 | 18.12 | 19 | 1.5334 |
73
+ | 1.6568 | 19.12 | 20 | 1.4794 |
74
+ | 1.6568 | 20.12 | 21 | 1.4008 |
75
+ | 1.6568 | 21.12 | 22 | 1.3625 |
76
+ | 1.6568 | 22.12 | 23 | 1.2964 |
77
+ | 1.6568 | 23.12 | 24 | 1.2041 |
78
+ | 1.3674 | 24.12 | 25 | 1.1971 |
79
+ | 1.3674 | 25.12 | 26 | 1.1571 |
80
+ | 1.3674 | 26.12 | 27 | 1.1080 |
81
+ | 1.3674 | 27.12 | 28 | 1.1099 |
82
+ | 1.3674 | 28.12 | 29 | 1.0930 |
83
+ | 1.145 | 29.12 | 30 | 1.0333 |
84
+ | 1.145 | 30.12 | 31 | 1.0096 |
85
+ | 1.145 | 31.12 | 32 | 1.0012 |
86
+ | 1.145 | 32.12 | 33 | 0.9266 |
87
+ | 1.145 | 33.12 | 34 | 0.9624 |
88
+ | 0.9987 | 34.12 | 35 | 0.9425 |
89
+ | 0.9987 | 35.12 | 36 | 0.9354 |
90
+ | 0.9987 | 36.12 | 37 | 0.9091 |
91
+ | 0.9987 | 37.12 | 38 | 0.9007 |
92
+ | 0.9987 | 38.12 | 39 | 0.9649 |
93
+ | 0.9071 | 39.12 | 40 | 0.9199 |
94
+ | 0.9071 | 40.12 | 41 | 0.8651 |
95
+ | 0.9071 | 41.12 | 42 | 0.8727 |
96
+ | 0.9071 | 42.12 | 43 | 0.8559 |
97
+ | 0.9071 | 43.12 | 44 | 0.8499 |
98
+ | 0.8522 | 44.12 | 45 | 0.8547 |
99
+ | 0.8522 | 45.12 | 46 | 0.8880 |
100
+ | 0.8522 | 46.12 | 47 | 0.8678 |
101
+ | 0.8522 | 47.12 | 48 | 0.8565 |
102
+ | 0.8522 | 48.12 | 49 | 0.8197 |
103
+ | 0.8153 | 49.12 | 50 | 0.8439 |
104
+
105
+
106
+ ### Framework versions
107
+
108
+ - Transformers 4.35.0
109
+ - Pytorch 2.1.0a0+32f93b1
110
+ - Datasets 2.14.6
111
+ - Tokenizers 0.14.1
adapter_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "lora_alpha": 16,
12
+ "lora_dropout": 0.1,
13
+ "modules_to_save": null,
14
+ "peft_type": "LORA",
15
+ "r": 64,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "q_proj",
20
+ "o_proj",
21
+ "k_proj",
22
+ "v_proj"
23
+ ],
24
+ "task_type": "CAUSAL_LM"
25
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b8fb882c6507996dc92444210aadcbd379803735a95c8abe09cdc9cf4e6064
3
+ size 109086672
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 49.12,
3
+ "eval_loss": 0.8817058205604553,
4
+ "eval_runtime": 1.5268,
5
+ "eval_samples": 234,
6
+ "eval_samples_per_second": 153.259,
7
+ "eval_steps_per_second": 5.24,
8
+ "train_loss": 1.4773838996887207,
9
+ "train_runtime": 2397.7837,
10
+ "train_samples": 2097,
11
+ "train_samples_per_second": 43.728,
12
+ "train_steps_per_second": 0.167
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 49.12,
3
+ "eval_loss": 0.8817058205604553,
4
+ "eval_runtime": 1.5268,
5
+ "eval_samples": 234,
6
+ "eval_samples_per_second": 153.259,
7
+ "eval_steps_per_second": 5.24
8
+ }
runs/Jan08_21-52-00_c33a1a6d4547/events.out.tfevents.1704750734.c33a1a6d4547.2543.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:981f9a9e965b6b29c66f22c5fae8ebd41450651131f52be8d587ebd76088a903
3
+ size 19648
runs/Jan08_21-52-00_c33a1a6d4547/events.out.tfevents.1704753133.c33a1a6d4547.2543.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:631ec612be7427f241eb354776d8d14b9e78f42f4de5df304457d245e388214a
3
+ size 306
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "additional_special_tokens": [],
29
+ "bos_token": "<s>",
30
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
31
+ "clean_up_tokenization_spaces": false,
32
+ "eos_token": "</s>",
33
+ "legacy": true,
34
+ "model_max_length": 2048,
35
+ "pad_token": "</s>",
36
+ "sp_model_kwargs": {},
37
+ "spaces_between_special_tokens": false,
38
+ "tokenizer_class": "LlamaTokenizer",
39
+ "unk_token": "<unk>",
40
+ "use_default_system_prompt": false
41
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 49.12,
3
+ "train_loss": 1.4773838996887207,
4
+ "train_runtime": 2397.7837,
5
+ "train_samples": 2097,
6
+ "train_samples_per_second": 43.728,
7
+ "train_steps_per_second": 0.167
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 49.121212121212125,
5
+ "eval_steps": 500,
6
+ "global_step": 50,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.12,
13
+ "learning_rate": 0.00029999537364671844,
14
+ "loss": 3.1318,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.12,
19
+ "eval_loss": 2.835486888885498,
20
+ "eval_runtime": 2.1688,
21
+ "eval_samples_per_second": 107.896,
22
+ "eval_steps_per_second": 3.689,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 1.12,
27
+ "eval_loss": 2.6363508701324463,
28
+ "eval_runtime": 1.538,
29
+ "eval_samples_per_second": 152.15,
30
+ "eval_steps_per_second": 5.202,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 2.12,
35
+ "eval_loss": 2.4945499897003174,
36
+ "eval_runtime": 1.526,
37
+ "eval_samples_per_second": 153.347,
38
+ "eval_steps_per_second": 5.243,
39
+ "step": 3
40
+ },
41
+ {
42
+ "epoch": 3.12,
43
+ "eval_loss": 2.5338640213012695,
44
+ "eval_runtime": 1.5296,
45
+ "eval_samples_per_second": 152.982,
46
+ "eval_steps_per_second": 5.23,
47
+ "step": 4
48
+ },
49
+ {
50
+ "epoch": 4.12,
51
+ "learning_rate": 0.00029988435543610843,
52
+ "loss": 2.7386,
53
+ "step": 5
54
+ },
55
+ {
56
+ "epoch": 4.12,
57
+ "eval_loss": 2.3351666927337646,
58
+ "eval_runtime": 1.5386,
59
+ "eval_samples_per_second": 152.088,
60
+ "eval_steps_per_second": 5.2,
61
+ "step": 5
62
+ },
63
+ {
64
+ "epoch": 5.12,
65
+ "eval_loss": 2.2136902809143066,
66
+ "eval_runtime": 1.517,
67
+ "eval_samples_per_second": 154.247,
68
+ "eval_steps_per_second": 5.273,
69
+ "step": 6
70
+ },
71
+ {
72
+ "epoch": 6.12,
73
+ "eval_loss": 2.164069652557373,
74
+ "eval_runtime": 1.5148,
75
+ "eval_samples_per_second": 154.475,
76
+ "eval_steps_per_second": 5.281,
77
+ "step": 7
78
+ },
79
+ {
80
+ "epoch": 7.12,
81
+ "eval_loss": 2.105088710784912,
82
+ "eval_runtime": 1.5384,
83
+ "eval_samples_per_second": 152.111,
84
+ "eval_steps_per_second": 5.2,
85
+ "step": 8
86
+ },
87
+ {
88
+ "epoch": 8.12,
89
+ "eval_loss": 2.0841920375823975,
90
+ "eval_runtime": 1.6789,
91
+ "eval_samples_per_second": 139.374,
92
+ "eval_steps_per_second": 4.765,
93
+ "step": 9
94
+ },
95
+ {
96
+ "epoch": 9.12,
97
+ "learning_rate": 0.00029953760005996916,
98
+ "loss": 2.269,
99
+ "step": 10
100
+ },
101
+ {
102
+ "epoch": 9.12,
103
+ "eval_loss": 2.047882556915283,
104
+ "eval_runtime": 1.5168,
105
+ "eval_samples_per_second": 154.272,
106
+ "eval_steps_per_second": 5.274,
107
+ "step": 10
108
+ },
109
+ {
110
+ "epoch": 10.12,
111
+ "eval_loss": 1.9553548097610474,
112
+ "eval_runtime": 1.5263,
113
+ "eval_samples_per_second": 153.317,
114
+ "eval_steps_per_second": 5.242,
115
+ "step": 11
116
+ },
117
+ {
118
+ "epoch": 11.12,
119
+ "eval_loss": 1.8555233478546143,
120
+ "eval_runtime": 1.527,
121
+ "eval_samples_per_second": 153.245,
122
+ "eval_steps_per_second": 5.239,
123
+ "step": 12
124
+ },
125
+ {
126
+ "epoch": 12.12,
127
+ "eval_loss": 1.7735551595687866,
128
+ "eval_runtime": 1.5194,
129
+ "eval_samples_per_second": 154.009,
130
+ "eval_steps_per_second": 5.265,
131
+ "step": 13
132
+ },
133
+ {
134
+ "epoch": 13.12,
135
+ "eval_loss": 1.7906467914581299,
136
+ "eval_runtime": 1.5217,
137
+ "eval_samples_per_second": 153.779,
138
+ "eval_steps_per_second": 5.257,
139
+ "step": 14
140
+ },
141
+ {
142
+ "epoch": 14.12,
143
+ "learning_rate": 0.00029896026854323894,
144
+ "loss": 1.9451,
145
+ "step": 15
146
+ },
147
+ {
148
+ "epoch": 14.12,
149
+ "eval_loss": 1.7737478017807007,
150
+ "eval_runtime": 1.5139,
151
+ "eval_samples_per_second": 154.563,
152
+ "eval_steps_per_second": 5.284,
153
+ "step": 15
154
+ },
155
+ {
156
+ "epoch": 15.12,
157
+ "eval_loss": 1.6676586866378784,
158
+ "eval_runtime": 1.5263,
159
+ "eval_samples_per_second": 153.316,
160
+ "eval_steps_per_second": 5.242,
161
+ "step": 16
162
+ },
163
+ {
164
+ "epoch": 16.12,
165
+ "eval_loss": 1.6410826444625854,
166
+ "eval_runtime": 1.5192,
167
+ "eval_samples_per_second": 154.025,
168
+ "eval_steps_per_second": 5.266,
169
+ "step": 17
170
+ },
171
+ {
172
+ "epoch": 17.12,
173
+ "eval_loss": 1.5739473104476929,
174
+ "eval_runtime": 1.5309,
175
+ "eval_samples_per_second": 152.855,
176
+ "eval_steps_per_second": 5.226,
177
+ "step": 18
178
+ },
179
+ {
180
+ "epoch": 18.12,
181
+ "eval_loss": 1.5334192514419556,
182
+ "eval_runtime": 1.5271,
183
+ "eval_samples_per_second": 153.235,
184
+ "eval_steps_per_second": 5.239,
185
+ "step": 19
186
+ },
187
+ {
188
+ "epoch": 19.12,
189
+ "learning_rate": 0.00029815325108927063,
190
+ "loss": 1.6568,
191
+ "step": 20
192
+ },
193
+ {
194
+ "epoch": 19.12,
195
+ "eval_loss": 1.47941255569458,
196
+ "eval_runtime": 1.5335,
197
+ "eval_samples_per_second": 152.593,
198
+ "eval_steps_per_second": 5.217,
199
+ "step": 20
200
+ },
201
+ {
202
+ "epoch": 20.12,
203
+ "eval_loss": 1.4007827043533325,
204
+ "eval_runtime": 1.5222,
205
+ "eval_samples_per_second": 153.722,
206
+ "eval_steps_per_second": 5.255,
207
+ "step": 21
208
+ },
209
+ {
210
+ "epoch": 21.12,
211
+ "eval_loss": 1.3624812364578247,
212
+ "eval_runtime": 1.5197,
213
+ "eval_samples_per_second": 153.982,
214
+ "eval_steps_per_second": 5.264,
215
+ "step": 22
216
+ },
217
+ {
218
+ "epoch": 22.12,
219
+ "eval_loss": 1.2963740825653076,
220
+ "eval_runtime": 1.5258,
221
+ "eval_samples_per_second": 153.363,
222
+ "eval_steps_per_second": 5.243,
223
+ "step": 23
224
+ },
225
+ {
226
+ "epoch": 23.12,
227
+ "eval_loss": 1.2041164636611938,
228
+ "eval_runtime": 1.5251,
229
+ "eval_samples_per_second": 153.434,
230
+ "eval_steps_per_second": 5.246,
231
+ "step": 24
232
+ },
233
+ {
234
+ "epoch": 24.12,
235
+ "learning_rate": 0.00029711779206048454,
236
+ "loss": 1.3674,
237
+ "step": 25
238
+ },
239
+ {
240
+ "epoch": 24.12,
241
+ "eval_loss": 1.1971029043197632,
242
+ "eval_runtime": 1.535,
243
+ "eval_samples_per_second": 152.446,
244
+ "eval_steps_per_second": 5.212,
245
+ "step": 25
246
+ },
247
+ {
248
+ "epoch": 25.12,
249
+ "eval_loss": 1.1571109294891357,
250
+ "eval_runtime": 1.5213,
251
+ "eval_samples_per_second": 153.815,
252
+ "eval_steps_per_second": 5.259,
253
+ "step": 26
254
+ },
255
+ {
256
+ "epoch": 26.12,
257
+ "eval_loss": 1.1079976558685303,
258
+ "eval_runtime": 1.5286,
259
+ "eval_samples_per_second": 153.079,
260
+ "eval_steps_per_second": 5.233,
261
+ "step": 27
262
+ },
263
+ {
264
+ "epoch": 27.12,
265
+ "eval_loss": 1.109868049621582,
266
+ "eval_runtime": 1.5388,
267
+ "eval_samples_per_second": 152.068,
268
+ "eval_steps_per_second": 5.199,
269
+ "step": 28
270
+ },
271
+ {
272
+ "epoch": 28.12,
273
+ "eval_loss": 1.0929827690124512,
274
+ "eval_runtime": 1.5243,
275
+ "eval_samples_per_second": 153.513,
276
+ "eval_steps_per_second": 5.248,
277
+ "step": 29
278
+ },
279
+ {
280
+ "epoch": 29.12,
281
+ "learning_rate": 0.0002958554880596515,
282
+ "loss": 1.145,
283
+ "step": 30
284
+ },
285
+ {
286
+ "epoch": 29.12,
287
+ "eval_loss": 1.0333445072174072,
288
+ "eval_runtime": 1.528,
289
+ "eval_samples_per_second": 153.138,
290
+ "eval_steps_per_second": 5.235,
291
+ "step": 30
292
+ },
293
+ {
294
+ "epoch": 30.12,
295
+ "eval_loss": 1.009576678276062,
296
+ "eval_runtime": 1.5222,
297
+ "eval_samples_per_second": 153.722,
298
+ "eval_steps_per_second": 5.255,
299
+ "step": 31
300
+ },
301
+ {
302
+ "epoch": 31.12,
303
+ "eval_loss": 1.0011868476867676,
304
+ "eval_runtime": 1.5185,
305
+ "eval_samples_per_second": 154.104,
306
+ "eval_steps_per_second": 5.269,
307
+ "step": 32
308
+ },
309
+ {
310
+ "epoch": 32.12,
311
+ "eval_loss": 0.9265638589859009,
312
+ "eval_runtime": 1.5235,
313
+ "eval_samples_per_second": 153.589,
314
+ "eval_steps_per_second": 5.251,
315
+ "step": 33
316
+ },
317
+ {
318
+ "epoch": 33.12,
319
+ "eval_loss": 0.962448239326477,
320
+ "eval_runtime": 1.5219,
321
+ "eval_samples_per_second": 153.758,
322
+ "eval_steps_per_second": 5.257,
323
+ "step": 34
324
+ },
325
+ {
326
+ "epoch": 34.12,
327
+ "learning_rate": 0.000294368285468047,
328
+ "loss": 0.9987,
329
+ "step": 35
330
+ },
331
+ {
332
+ "epoch": 34.12,
333
+ "eval_loss": 0.9425073862075806,
334
+ "eval_runtime": 1.5206,
335
+ "eval_samples_per_second": 153.885,
336
+ "eval_steps_per_second": 5.261,
337
+ "step": 35
338
+ },
339
+ {
340
+ "epoch": 35.12,
341
+ "eval_loss": 0.9353674650192261,
342
+ "eval_runtime": 1.5211,
343
+ "eval_samples_per_second": 153.831,
344
+ "eval_steps_per_second": 5.259,
345
+ "step": 36
346
+ },
347
+ {
348
+ "epoch": 36.12,
349
+ "eval_loss": 0.9090538024902344,
350
+ "eval_runtime": 1.5239,
351
+ "eval_samples_per_second": 153.554,
352
+ "eval_steps_per_second": 5.25,
353
+ "step": 37
354
+ },
355
+ {
356
+ "epoch": 37.12,
357
+ "eval_loss": 0.9006912708282471,
358
+ "eval_runtime": 1.6666,
359
+ "eval_samples_per_second": 140.404,
360
+ "eval_steps_per_second": 4.8,
361
+ "step": 38
362
+ },
363
+ {
364
+ "epoch": 38.12,
365
+ "eval_loss": 0.9648869037628174,
366
+ "eval_runtime": 1.5236,
367
+ "eval_samples_per_second": 153.587,
368
+ "eval_steps_per_second": 5.251,
369
+ "step": 39
370
+ },
371
+ {
372
+ "epoch": 39.12,
373
+ "learning_rate": 0.00029265847744427303,
374
+ "loss": 0.9071,
375
+ "step": 40
376
+ },
377
+ {
378
+ "epoch": 39.12,
379
+ "eval_loss": 0.9199429154396057,
380
+ "eval_runtime": 1.526,
381
+ "eval_samples_per_second": 153.343,
382
+ "eval_steps_per_second": 5.242,
383
+ "step": 40
384
+ },
385
+ {
386
+ "epoch": 40.12,
387
+ "eval_loss": 0.8650604486465454,
388
+ "eval_runtime": 1.5281,
389
+ "eval_samples_per_second": 153.127,
390
+ "eval_steps_per_second": 5.235,
391
+ "step": 41
392
+ },
393
+ {
394
+ "epoch": 41.12,
395
+ "eval_loss": 0.8727077841758728,
396
+ "eval_runtime": 1.5186,
397
+ "eval_samples_per_second": 154.087,
398
+ "eval_steps_per_second": 5.268,
399
+ "step": 42
400
+ },
401
+ {
402
+ "epoch": 42.12,
403
+ "eval_loss": 0.8558970093727112,
404
+ "eval_runtime": 1.5297,
405
+ "eval_samples_per_second": 152.968,
406
+ "eval_steps_per_second": 5.23,
407
+ "step": 43
408
+ },
409
+ {
410
+ "epoch": 43.12,
411
+ "eval_loss": 0.8499311804771423,
412
+ "eval_runtime": 1.5225,
413
+ "eval_samples_per_second": 153.692,
414
+ "eval_steps_per_second": 5.254,
415
+ "step": 44
416
+ },
417
+ {
418
+ "epoch": 44.12,
419
+ "learning_rate": 0.0002907287003883726,
420
+ "loss": 0.8522,
421
+ "step": 45
422
+ },
423
+ {
424
+ "epoch": 44.12,
425
+ "eval_loss": 0.8547362089157104,
426
+ "eval_runtime": 1.5331,
427
+ "eval_samples_per_second": 152.637,
428
+ "eval_steps_per_second": 5.218,
429
+ "step": 45
430
+ },
431
+ {
432
+ "epoch": 45.12,
433
+ "eval_loss": 0.8880292177200317,
434
+ "eval_runtime": 1.5217,
435
+ "eval_samples_per_second": 153.771,
436
+ "eval_steps_per_second": 5.257,
437
+ "step": 46
438
+ },
439
+ {
440
+ "epoch": 46.12,
441
+ "eval_loss": 0.8677502870559692,
442
+ "eval_runtime": 1.5273,
443
+ "eval_samples_per_second": 153.214,
444
+ "eval_steps_per_second": 5.238,
445
+ "step": 47
446
+ },
447
+ {
448
+ "epoch": 47.12,
449
+ "eval_loss": 0.8565409183502197,
450
+ "eval_runtime": 1.5222,
451
+ "eval_samples_per_second": 153.723,
452
+ "eval_steps_per_second": 5.255,
453
+ "step": 48
454
+ },
455
+ {
456
+ "epoch": 48.12,
457
+ "eval_loss": 0.8197174072265625,
458
+ "eval_runtime": 1.5188,
459
+ "eval_samples_per_second": 154.07,
460
+ "eval_steps_per_second": 5.267,
461
+ "step": 49
462
+ },
463
+ {
464
+ "epoch": 49.12,
465
+ "learning_rate": 0.000288581929876693,
466
+ "loss": 0.8153,
467
+ "step": 50
468
+ },
469
+ {
470
+ "epoch": 49.12,
471
+ "eval_loss": 0.8439480662345886,
472
+ "eval_runtime": 1.5245,
473
+ "eval_samples_per_second": 153.497,
474
+ "eval_steps_per_second": 5.248,
475
+ "step": 50
476
+ },
477
+ {
478
+ "epoch": 49.12,
479
+ "step": 50,
480
+ "total_flos": 4527521789902848.0,
481
+ "train_loss": 1.4773838996887207,
482
+ "train_runtime": 2397.7837,
483
+ "train_samples_per_second": 43.728,
484
+ "train_steps_per_second": 0.167
485
+ }
486
+ ],
487
+ "logging_steps": 5,
488
+ "max_steps": 400,
489
+ "num_train_epochs": 50,
490
+ "save_steps": 500,
491
+ "total_flos": 4527521789902848.0,
492
+ "trial_name": null,
493
+ "trial_params": null
494
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a81458bbf3b08b77cd58016c15a52a8b576953f0dd41f6b8585e6b3ee4d7bbb1
3
+ size 5688