koichi12 commited on
Commit
d3983b6
·
verified ·
1 Parent(s): d151261

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. wandb/run-20240804_021032-cd2cg2ui/files/config.yaml +314 -0
  2. wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt +271 -0
  3. wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log +185 -0
  4. wandb/run-20240804_021032-cd2cg2ui/logs/debug.log +28 -0
  5. wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb +0 -0
  6. wandb/run-20240804_035140-nyllt780/files/config.yaml +335 -0
  7. wandb/run-20240804_035140-nyllt780/files/output.log +130 -0
  8. wandb/run-20240804_035140-nyllt780/files/requirements.txt +271 -0
  9. wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json +215 -0
  10. wandb/run-20240804_035140-nyllt780/files/wandb-summary.json +1 -0
  11. wandb/run-20240804_035140-nyllt780/logs/debug-internal.log +215 -0
  12. wandb/run-20240804_035140-nyllt780/logs/debug.log +30 -0
  13. wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb +0 -0
  14. wandb/run-20240804_211947-niq3ake5/files/config.yaml +335 -0
  15. wandb/run-20240804_211947-niq3ake5/files/output.log +135 -0
  16. wandb/run-20240804_211947-niq3ake5/files/requirements.txt +271 -0
  17. wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json +215 -0
  18. wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json +1 -0
  19. wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log +213 -0
  20. wandb/run-20240804_211947-niq3ake5/logs/debug.log +30 -0
  21. wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb +0 -0
  22. wandb/run-20240812_055620-qpw0uqx2/files/config.yaml +314 -0
  23. wandb/run-20240812_055620-qpw0uqx2/files/output.log +9 -0
  24. wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt +271 -0
  25. wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json +215 -0
  26. wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json +1 -0
  27. wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log +181 -0
  28. wandb/run-20240812_055620-qpw0uqx2/logs/debug.log +27 -0
  29. wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb +0 -0
  30. wandb/run-20240812_073955-ikoro1zp/files/config.yaml +335 -0
  31. wandb/run-20240812_073955-ikoro1zp/files/output.log +0 -0
  32. wandb/run-20240812_073955-ikoro1zp/files/requirements.txt +271 -0
  33. wandb/run-20240812_073955-ikoro1zp/files/wandb-metadata.json +215 -0
  34. wandb/run-20240812_073955-ikoro1zp/files/wandb-summary.json +1 -0
  35. wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log +0 -0
  36. wandb/run-20240812_073955-ikoro1zp/logs/debug.log +29 -0
  37. wandb/run-20240823_160642-78xnl14c/files/config.yaml +342 -0
  38. wandb/run-20240823_160642-78xnl14c/files/output.log +253 -0
  39. wandb/run-20240823_160642-78xnl14c/files/requirements.txt +375 -0
  40. wandb/run-20240823_160642-78xnl14c/files/wandb-metadata.json +220 -0
  41. wandb/run-20240823_160642-78xnl14c/files/wandb-summary.json +1 -0
  42. wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log +0 -0
  43. wandb/run-20240823_160642-78xnl14c/logs/debug.log +30 -0
  44. wandb/run-20240823_160642-78xnl14c/run-78xnl14c.wandb +0 -0
  45. wandb/run-20240823_162922-z3gs82jm/files/config.yaml +342 -0
  46. wandb/run-20240823_162922-z3gs82jm/files/output.log +174 -0
  47. wandb/run-20240823_162922-z3gs82jm/files/requirements.txt +375 -0
  48. wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json +220 -0
  49. wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json +1 -0
  50. wandb/run-20240823_162922-z3gs82jm/logs/debug-internal.log +453 -0
wandb/run-20240804_021032-cd2cg2ui/files/config.yaml ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 1024
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-04-02:10:14
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 8192
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722705032.417279
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 02:10:32,669 INFO StreamThr :11309 [internal.py:wandb_internal():86] W&B internal server running at pid: 11309, started at: 2024-08-04 02:10:32.417731
2
+ 2024-08-04 02:10:32,670 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 02:10:32,672 INFO WriterThread:11309 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb
4
+ 2024-08-04 02:10:32,673 DEBUG SenderThread:11309 [sender.py:send():382] send: header
5
+ 2024-08-04 02:10:32,883 DEBUG SenderThread:11309 [sender.py:send():382] send: run
6
+ 2024-08-04 02:10:33,348 INFO SenderThread:11309 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_021032-cd2cg2ui/files
7
+ 2024-08-04 02:10:33,348 INFO SenderThread:11309 [sender.py:_start_run_threads():1136] run started: cd2cg2ui with start time 1722705032.417279
8
+ 2024-08-04 02:10:33,353 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 02:10:33,354 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 02:10:33,438 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 02:10:33,444 DEBUG HandlerThread:11309 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 02:10:33,444 DEBUG HandlerThread:11309 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 02:10:33,444 INFO HandlerThread:11309 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 02:10:33,444 INFO SystemMonitor:11309 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 02:10:33,445 INFO HandlerThread:11309 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 02:10:33,445 INFO SystemMonitor:11309 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 02:10:33,445 INFO SystemMonitor:11309 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 02:10:33,447 INFO SystemMonitor:11309 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 02:10:33,447 INFO SystemMonitor:11309 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 02:10:33,448 INFO SystemMonitor:11309 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 02:10:33,458 DEBUG HandlerThread:11309 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 02:10:33,460 DEBUG HandlerThread:11309 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T17:10:33.458421', 'startedAt': '2024-08-03T17:10:32.395506', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '8192', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-04-02:10:14'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 02:10:33,471 INFO HandlerThread:11309 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 02:10:33,471 INFO HandlerThread:11309 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 02:10:33,473 INFO HandlerThread:11309 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 02:10:33,478 DEBUG SenderThread:11309 [sender.py:send():382] send: files
30
+ 2024-08-04 02:10:33,479 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 02:10:33,488 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 02:10:33,488 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 02:10:33,488 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 02:10:33,489 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 02:10:33,490 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 02:10:33,776 DEBUG SenderThread:11309 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 02:10:34,131 INFO wandb-upload_0:11309 [upload_job.py:push():131] Uploaded file /tmp/tmpcp1trk59wandb/1uhn5dog-wandb-metadata.json
38
+ 2024-08-04 02:10:34,327 DEBUG SenderThread:11309 [sender.py:send():382] send: exit
39
+ 2024-08-04 02:10:34,327 INFO SenderThread:11309 [sender.py:send_exit():589] handling exit code: 1
40
+ 2024-08-04 02:10:34,327 INFO SenderThread:11309 [sender.py:send_exit():591] handling runtime: 0
41
+ 2024-08-04 02:10:34,328 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
42
+ 2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:send_exit():597] send defer
43
+ 2024-08-04 02:10:34,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
44
+ 2024-08-04 02:10:34,329 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 0
45
+ 2024-08-04 02:10:34,329 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
46
+ 2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 0
47
+ 2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 1
48
+ 2024-08-04 02:10:34,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-04 02:10:34,329 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 1
50
+ 2024-08-04 02:10:34,330 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-04 02:10:34,330 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 1
52
+ 2024-08-04 02:10:34,330 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 2
53
+ 2024-08-04 02:10:34,330 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-04 02:10:34,330 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 2
55
+ 2024-08-04 02:10:34,330 INFO HandlerThread:11309 [system_monitor.py:finish():203] Stopping system monitor
56
+ 2024-08-04 02:10:34,330 DEBUG SystemMonitor:11309 [system_monitor.py:_start():172] Starting system metrics aggregation loop
57
+ 2024-08-04 02:10:34,330 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined cpu monitor
58
+ 2024-08-04 02:10:34,330 DEBUG SystemMonitor:11309 [system_monitor.py:_start():179] Finished system metrics aggregation loop
59
+ 2024-08-04 02:10:34,331 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined disk monitor
60
+ 2024-08-04 02:10:34,331 DEBUG SystemMonitor:11309 [system_monitor.py:_start():183] Publishing last batch of metrics
61
+ 2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt
62
+ 2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
63
+ 2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json
64
+ 2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json
65
+ 2024-08-04 02:10:34,365 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined gpu monitor
66
+ 2024-08-04 02:10:34,366 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined memory monitor
67
+ 2024-08-04 02:10:34,366 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined network monitor
68
+ 2024-08-04 02:10:34,366 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
69
+ 2024-08-04 02:10:34,366 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 2
70
+ 2024-08-04 02:10:34,366 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 3
71
+ 2024-08-04 02:10:34,366 DEBUG SenderThread:11309 [sender.py:send():382] send: stats
72
+ 2024-08-04 02:10:34,366 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
73
+ 2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 3
74
+ 2024-08-04 02:10:34,367 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
75
+ 2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 3
76
+ 2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 4
77
+ 2024-08-04 02:10:34,367 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
78
+ 2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 4
79
+ 2024-08-04 02:10:34,367 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 4
81
+ 2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 5
82
+ 2024-08-04 02:10:34,367 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 5
84
+ 2024-08-04 02:10:34,368 DEBUG SenderThread:11309 [sender.py:send():382] send: summary
85
+ 2024-08-04 02:10:34,368 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
86
+ 2024-08-04 02:10:34,369 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
87
+ 2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 5
88
+ 2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 6
89
+ 2024-08-04 02:10:34,369 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
90
+ 2024-08-04 02:10:34,369 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 6
91
+ 2024-08-04 02:10:34,369 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 6
93
+ 2024-08-04 02:10:34,372 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-04 02:10:34,573 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 7
95
+ 2024-08-04 02:10:34,573 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
96
+ 2024-08-04 02:10:34,573 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 7
97
+ 2024-08-04 02:10:34,573 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
98
+ 2024-08-04 02:10:34,573 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 7
99
+ 2024-08-04 02:10:35,327 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
100
+ 2024-08-04 02:10:35,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml
101
+ 2024-08-04 02:10:36,318 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 8
102
+ 2024-08-04 02:10:36,318 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
103
+ 2024-08-04 02:10:36,318 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
104
+ 2024-08-04 02:10:36,318 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 8
105
+ 2024-08-04 02:10:36,318 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
106
+ 2024-08-04 02:10:36,318 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 8
107
+ 2024-08-04 02:10:36,319 INFO SenderThread:11309 [job_builder.py:build():296] Attempting to build job artifact
108
+ 2024-08-04 02:10:36,319 INFO SenderThread:11309 [job_builder.py:_get_source_type():426] is repo sourced job
109
+ 2024-08-04 02:10:36,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
110
+ 2024-08-04 02:10:36,333 INFO SenderThread:11309 [job_builder.py:build():402] adding wandb-job metadata file
111
+ 2024-08-04 02:10:36,342 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 9
112
+ 2024-08-04 02:10:36,342 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
113
+ 2024-08-04 02:10:36,342 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
114
+ 2024-08-04 02:10:36,342 DEBUG SenderThread:11309 [sender.py:send():382] send: artifact
115
+ 2024-08-04 02:10:36,342 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 9
116
+ 2024-08-04 02:10:36,351 INFO Thread-12 :11309 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
117
+ 2024-08-04 02:10:37,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
118
+ 2024-08-04 02:10:37,552 INFO wandb-upload_1:11309 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpteaibpd9
119
+ 2024-08-04 02:10:37,910 INFO wandb-upload_0:11309 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp9hfu5wh3
120
+ 2024-08-04 02:10:39,216 INFO SenderThread:11309 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MTk4ODAyMA==', 'versionIndex': 2}}}
121
+ 2024-08-04 02:10:39,216 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
122
+ 2024-08-04 02:10:39,216 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 9
123
+ 2024-08-04 02:10:39,216 INFO SenderThread:11309 [dir_watcher.py:finish():358] shutting down directory watcher
124
+ 2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_021032-cd2cg2ui/files
125
+ 2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt requirements.txt
126
+ 2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml config.yaml
127
+ 2024-08-04 02:10:39,354 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json wandb-metadata.json
128
+ 2024-08-04 02:10:39,354 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json wandb-summary.json
129
+ 2024-08-04 02:10:39,355 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log output.log
130
+ 2024-08-04 02:10:39,357 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 10
131
+ 2024-08-04 02:10:39,357 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
132
+ 2024-08-04 02:10:39,358 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
133
+ 2024-08-04 02:10:39,358 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 10
134
+ 2024-08-04 02:10:39,359 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
135
+ 2024-08-04 02:10:39,359 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 10
136
+ 2024-08-04 02:10:39,359 INFO SenderThread:11309 [file_pusher.py:finish():172] shutting down file pusher
137
+ 2024-08-04 02:10:39,788 INFO wandb-upload_0:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml
138
+ 2024-08-04 02:10:39,856 INFO wandb-upload_1:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt
139
+ 2024-08-04 02:10:39,931 INFO wandb-upload_3:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
140
+ 2024-08-04 02:10:39,937 INFO wandb-upload_2:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json
141
+ 2024-08-04 02:10:40,137 INFO Thread-11 (_thread_body):11309 [sender.py:transition_state():617] send defer: 11
142
+ 2024-08-04 02:10:40,137 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
143
+ 2024-08-04 02:10:40,137 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 11
144
+ 2024-08-04 02:10:40,138 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
145
+ 2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 11
146
+ 2024-08-04 02:10:40,138 INFO SenderThread:11309 [file_pusher.py:join():178] waiting for file pusher
147
+ 2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 12
148
+ 2024-08-04 02:10:40,138 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
149
+ 2024-08-04 02:10:40,138 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 12
150
+ 2024-08-04 02:10:40,138 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
151
+ 2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 12
152
+ 2024-08-04 02:10:40,138 INFO SenderThread:11309 [file_stream.py:finish():595] file stream finish called
153
+ 2024-08-04 02:10:40,324 INFO SenderThread:11309 [file_stream.py:finish():599] file stream finish is done
154
+ 2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 13
155
+ 2024-08-04 02:10:40,324 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
156
+ 2024-08-04 02:10:40,324 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 13
157
+ 2024-08-04 02:10:40,324 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
158
+ 2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 13
159
+ 2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 14
160
+ 2024-08-04 02:10:40,325 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
161
+ 2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send():382] send: final
162
+ 2024-08-04 02:10:40,325 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 14
163
+ 2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send():382] send: footer
164
+ 2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
165
+ 2024-08-04 02:10:40,325 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 14
166
+ 2024-08-04 02:10:40,325 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
167
+ 2024-08-04 02:10:40,326 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 02:10:40,326 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: server_info
170
+ 2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: server_info
172
+ 2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: sampled_history
174
+ 2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: internal_messages
175
+ 2024-08-04 02:10:40,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: job_info
176
+ 2024-08-04 02:10:40,492 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: job_info
177
+ 2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_history_summary_info():3866] rendering history
178
+ 2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
179
+ 2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_sync_info():3825] logging synced files
180
+ 2024-08-04 02:10:40,492 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: shutdown
181
+ 2024-08-04 02:10:40,492 INFO HandlerThread:11309 [handler.py:finish():869] shutting down handler
182
+ 2024-08-04 02:10:41,329 INFO WriterThread:11309 [datastore.py:close():296] close: /project/wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb
183
+ 2024-08-04 02:10:41,492 INFO SenderThread:11309 [sender.py:finish():1572] shutting down sender
184
+ 2024-08-04 02:10:41,492 INFO SenderThread:11309 [file_pusher.py:finish():172] shutting down file pusher
185
+ 2024-08-04 02:10:41,492 INFO SenderThread:11309 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_021032-cd2cg2ui/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 02:10:32,410 INFO MainThread:11238 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Configure stats pid to 11238
3
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021032-cd2cg2ui/logs/debug.log
9
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log
10
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-04-02:10:14', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 8192, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 02:10:32,416 INFO MainThread:11238 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 02:10:32,416 INFO MainThread:11238 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 02:10:32,422 INFO MainThread:11238 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 02:10:32,879 INFO MainThread:11238 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 02:10:33,353 INFO MainThread:11238 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 02:10:33,431 INFO MainThread:11238 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 02:10:33,431 INFO MainThread:11238 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 02:10:33,488 INFO MainThread:11238 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 02:10:33,489 INFO MainThread:11238 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 02:10:41,493 WARNING MsgRouterThr:11238 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb ADDED
Binary file (7.16 kB). View file
 
wandb/run-20240804_035140-nyllt780/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama-sample_train_2024-08-04-03:51:30
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722711100.510646
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_035140-nyllt780/files/output.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-llama-sample.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
9
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
10
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
11
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
12
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
13
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
14
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
15
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
16
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ Let split = None
20
+ Building a BlendedDataset for a single MegatronDataset
21
+ Unable to save the indexes because path_to_cache is None
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ Building a BlendedDataset for a single MegatronDataset
25
+ Unable to save the indexes because path_to_cache is None
26
+ BFloat16 enabled for mixed precision - using bfSixteen policy
27
+ --> applying fsdp activation checkpointing...
28
+ > datasets target sizes (minimum size):
29
+ train: 6400000
30
+ validation: 323200
31
+ test: 3200
32
+ > building train, validation, and test datasets for GPT ...
33
+ > finished creating GPT datasets ...
34
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
35
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
36
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
37
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
38
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
39
+ model info: FullyShardedDataParallel(
40
+ (_fsdp_wrapped_module): LlamaForCausalLM(
41
+ (model): LlamaModel(
42
+ (embed_tokens): Embedding(32000, 2048)
43
+ (layers): ModuleList(
44
+ (0-21): 22 x FullyShardedDataParallel(
45
+ (_fsdp_wrapped_module): CheckpointWrapper(
46
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
47
+ (self_attn): LlamaFlashAttention2(
48
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
49
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
50
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
51
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
52
+ (rotary_emb): LlamaRotaryEmbedding()
53
+ )
54
+ (mlp): LlamaMLP(
55
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
56
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
57
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
58
+ (act_fn): SiLU()
59
+ )
60
+ (input_layernorm): LlamaRMSNorm()
61
+ (post_attention_layernorm): LlamaRMSNorm()
62
+ )
63
+ )
64
+ )
65
+ )
66
+ (norm): LlamaRMSNorm()
67
+ (rotary_emb): LlamaRotaryEmbedding()
68
+ )
69
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
70
+ )
71
+ )
72
+ model config: LlamaConfig {
73
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
74
+ "architectures": [
75
+ "LlamaForCausalLM"
76
+ ],
77
+ "attention_bias": false,
78
+ "attention_dropout": 0.0,
79
+ "bos_token_id": 1,
80
+ "eos_token_id": 2,
81
+ "hidden_act": "silu",
82
+ "hidden_size": 2048,
83
+ "initializer_range": 0.02,
84
+ "intermediate_size": 5632,
85
+ "label_smoothing": 0.0,
86
+ "max_position_embeddings": 2048,
87
+ "mlp_bias": false,
88
+ "model_type": "llama",
89
+ "num_attention_heads": 32,
90
+ "num_hidden_layers": 22,
91
+ "num_key_value_heads": 4,
92
+ "pretraining_tp": 1,
93
+ "rms_norm_eps": 1e-05,
94
+ "rope_scaling": null,
95
+ "rope_theta": 10000.0,
96
+ "tie_word_embeddings": false,
97
+ "torch_dtype": "float32",
98
+ "transformers_version": "4.43.3",
99
+ "use_cache": false,
100
+ "vocab_size": 32000
101
+ }
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
108
+ loss: torch.Tensor = model(**batch).loss
109
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
110
+ return self._call_impl(*args, **kwargs)
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
112
+ return forward_call(*args, **kwargs)
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
114
+ output = self._fsdp_wrapped_module(*args, **kwargs)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
116
+ return self._call_impl(*args, **kwargs)
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
118
+ return forward_call(*args, **kwargs)
119
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
120
+ outputs = self.model(
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
122
+ return self._call_impl(*args, **kwargs)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
124
+ return forward_call(*args, **kwargs)
125
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
126
+ cache_position = torch.arange(
127
+ RuntimeError: CUDA error: device-side assert triggered
128
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
129
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
130
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb/run-20240804_035140-nyllt780/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T18:51:41.236802",
5
+ "startedAt": "2024-08-03T18:51:40.498160",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama-sample_train_2024-08-04-03:51:30"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_035140-nyllt780/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 63}}
wandb/run-20240804_035140-nyllt780/logs/debug-internal.log ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 03:51:40,525 INFO StreamThr :12425 [internal.py:wandb_internal():86] W&B internal server running at pid: 12425, started at: 2024-08-04 03:51:40.511089
2
+ 2024-08-04 03:51:40,526 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 03:51:40,527 INFO WriterThread:12425 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb
4
+ 2024-08-04 03:51:40,541 DEBUG SenderThread:12425 [sender.py:send():382] send: header
5
+ 2024-08-04 03:51:40,658 DEBUG SenderThread:12425 [sender.py:send():382] send: run
6
+ 2024-08-04 03:51:41,127 INFO SenderThread:12425 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_035140-nyllt780/files
7
+ 2024-08-04 03:51:41,127 INFO SenderThread:12425 [sender.py:_start_run_threads():1136] run started: nyllt780 with start time 1722711100.510646
8
+ 2024-08-04 03:51:41,132 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 03:51:41,133 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 03:51:41,218 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 03:51:41,224 DEBUG HandlerThread:12425 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 03:51:41,224 DEBUG HandlerThread:12425 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 03:51:41,224 INFO HandlerThread:12425 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 03:51:41,224 INFO SystemMonitor:12425 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 03:51:41,224 INFO HandlerThread:12425 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 03:51:41,225 INFO SystemMonitor:12425 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 03:51:41,225 INFO SystemMonitor:12425 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 03:51:41,226 INFO SystemMonitor:12425 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 03:51:41,227 INFO SystemMonitor:12425 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 03:51:41,228 INFO SystemMonitor:12425 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 03:51:41,236 DEBUG HandlerThread:12425 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 03:51:41,238 DEBUG HandlerThread:12425 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 03:51:41,249 DEBUG HandlerThread:12425 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 03:51:41,249 DEBUG HandlerThread:12425 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 03:51:41,250 DEBUG HandlerThread:12425 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T18:51:41.236802', 'startedAt': '2024-08-03T18:51:40.498160', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-03:51:30'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 03:51:41,250 INFO HandlerThread:12425 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 03:51:41,250 INFO HandlerThread:12425 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 03:51:41,302 INFO HandlerThread:12425 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 03:51:41,308 DEBUG SenderThread:12425 [sender.py:send():382] send: files
30
+ 2024-08-04 03:51:41,308 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 03:51:41,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 03:51:41,335 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 03:51:41,618 DEBUG SenderThread:12425 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 03:51:41,985 INFO wandb-upload_0:12425 [upload_job.py:push():131] Uploaded file /tmp/tmpxkt1klm7wandb/bxmu94ae-wandb-metadata.json
38
+ 2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/output.log
39
+ 2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt
40
+ 2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json
41
+ 2024-08-04 03:51:44,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
42
+ 2024-08-04 03:51:45,608 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-04 03:51:48,132 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
44
+ 2024-08-04 03:51:50,610 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-04 03:51:55,611 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-04 03:51:56,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
47
+ 2024-08-04 03:51:56,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
48
+ 2024-08-04 03:51:56,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
49
+ 2024-08-04 03:52:01,592 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-04 03:52:06,593 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-04 03:52:11,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-04 03:52:11,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-04 03:52:11,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-04 03:52:12,552 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-04 03:52:13,160 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
56
+ 2024-08-04 03:52:17,755 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-04 03:52:22,755 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-04 03:52:26,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-04 03:52:26,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-04 03:52:26,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-04 03:52:28,589 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-04 03:52:33,590 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-04 03:52:38,591 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-04 03:52:41,228 DEBUG SystemMonitor:12425 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-04 03:52:41,230 DEBUG SenderThread:12425 [sender.py:send():382] send: stats
66
+ 2024-08-04 03:52:41,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-04 03:52:41,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-04 03:52:41,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-04 03:52:43,008 DEBUG SenderThread:12425 [sender.py:send():382] send: config
70
+ 2024-08-04 03:52:43,008 DEBUG SenderThread:12425 [sender.py:send():382] send: config
71
+ 2024-08-04 03:52:44,011 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-04 03:52:44,176 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
73
+ 2024-08-04 03:52:44,726 DEBUG SenderThread:12425 [sender.py:send():382] send: exit
74
+ 2024-08-04 03:52:44,726 INFO SenderThread:12425 [sender.py:send_exit():589] handling exit code: 1
75
+ 2024-08-04 03:52:44,726 INFO SenderThread:12425 [sender.py:send_exit():591] handling runtime: 63
76
+ 2024-08-04 03:52:44,741 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
77
+ 2024-08-04 03:52:44,741 INFO SenderThread:12425 [sender.py:send_exit():597] send defer
78
+ 2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 0
80
+ 2024-08-04 03:52:44,742 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 0
82
+ 2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 1
83
+ 2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 1
85
+ 2024-08-04 03:52:44,742 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
86
+ 2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 1
87
+ 2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 2
88
+ 2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 2
90
+ 2024-08-04 03:52:44,742 INFO HandlerThread:12425 [system_monitor.py:finish():203] Stopping system monitor
91
+ 2024-08-04 03:52:44,743 DEBUG SystemMonitor:12425 [system_monitor.py:_start():179] Finished system metrics aggregation loop
92
+ 2024-08-04 03:52:44,743 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined cpu monitor
93
+ 2024-08-04 03:52:44,743 DEBUG SystemMonitor:12425 [system_monitor.py:_start():183] Publishing last batch of metrics
94
+ 2024-08-04 03:52:44,743 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined disk monitor
95
+ 2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined gpu monitor
96
+ 2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined memory monitor
97
+ 2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined network monitor
98
+ 2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 03:52:44,778 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 2
100
+ 2024-08-04 03:52:44,778 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 3
101
+ 2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send():382] send: stats
102
+ 2024-08-04 03:52:44,778 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-04 03:52:44,778 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 3
104
+ 2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 3
106
+ 2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 4
107
+ 2024-08-04 03:52:44,779 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
108
+ 2024-08-04 03:52:44,779 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 4
109
+ 2024-08-04 03:52:44,779 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
110
+ 2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 4
111
+ 2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 5
112
+ 2024-08-04 03:52:44,779 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-04 03:52:44,779 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 5
114
+ 2024-08-04 03:52:44,779 DEBUG SenderThread:12425 [sender.py:send():382] send: summary
115
+ 2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
116
+ 2024-08-04 03:52:44,780 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
117
+ 2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 5
118
+ 2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 6
119
+ 2024-08-04 03:52:44,780 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-04 03:52:44,781 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 6
121
+ 2024-08-04 03:52:44,781 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
122
+ 2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 6
123
+ 2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 7
124
+ 2024-08-04 03:52:44,781 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-04 03:52:44,781 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-04 03:52:44,781 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 7
127
+ 2024-08-04 03:52:44,781 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
128
+ 2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 7
129
+ 2024-08-04 03:52:45,177 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
130
+ 2024-08-04 03:52:45,177 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json
131
+ 2024-08-04 03:52:45,726 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
132
+ 2024-08-04 03:52:46,178 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
133
+ 2024-08-04 03:52:47,600 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 8
134
+ 2024-08-04 03:52:47,600 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
135
+ 2024-08-04 03:52:47,600 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-04 03:52:47,601 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 8
137
+ 2024-08-04 03:52:47,601 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
138
+ 2024-08-04 03:52:47,601 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 8
139
+ 2024-08-04 03:52:47,601 INFO SenderThread:12425 [job_builder.py:build():296] Attempting to build job artifact
140
+ 2024-08-04 03:52:47,602 INFO SenderThread:12425 [job_builder.py:_get_source_type():426] is repo sourced job
141
+ 2024-08-04 03:52:47,616 INFO SenderThread:12425 [job_builder.py:build():402] adding wandb-job metadata file
142
+ 2024-08-04 03:52:47,688 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 9
143
+ 2024-08-04 03:52:47,689 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 03:52:47,689 DEBUG SenderThread:12425 [sender.py:send():382] send: artifact
145
+ 2024-08-04 03:52:47,689 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 9
146
+ 2024-08-04 03:52:47,727 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
147
+ 2024-08-04 03:52:48,179 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
148
+ 2024-08-04 03:52:48,575 INFO SenderThread:12425 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
149
+ 2024-08-04 03:52:48,575 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
150
+ 2024-08-04 03:52:48,575 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 9
151
+ 2024-08-04 03:52:48,575 INFO SenderThread:12425 [dir_watcher.py:finish():358] shutting down directory watcher
152
+ 2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_035140-nyllt780/files
153
+ 2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt requirements.txt
154
+ 2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml config.yaml
155
+ 2024-08-04 03:52:49,181 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json wandb-metadata.json
156
+ 2024-08-04 03:52:49,182 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json wandb-summary.json
157
+ 2024-08-04 03:52:49,183 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/output.log output.log
158
+ 2024-08-04 03:52:49,185 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 10
159
+ 2024-08-04 03:52:49,185 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
160
+ 2024-08-04 03:52:49,185 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
161
+ 2024-08-04 03:52:49,187 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 10
162
+ 2024-08-04 03:52:49,187 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
163
+ 2024-08-04 03:52:49,187 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 10
164
+ 2024-08-04 03:52:49,187 INFO SenderThread:12425 [file_pusher.py:finish():172] shutting down file pusher
165
+ 2024-08-04 03:52:49,580 INFO wandb-upload_0:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt
166
+ 2024-08-04 03:52:49,719 INFO wandb-upload_1:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
167
+ 2024-08-04 03:52:49,727 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 03:52:49,727 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 03:52:49,752 INFO wandb-upload_2:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json
170
+ 2024-08-04 03:52:49,778 INFO wandb-upload_3:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/output.log
171
+ 2024-08-04 03:52:49,978 INFO Thread-11 (_thread_body):12425 [sender.py:transition_state():617] send defer: 11
172
+ 2024-08-04 03:52:49,978 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
173
+ 2024-08-04 03:52:49,979 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 11
174
+ 2024-08-04 03:52:49,979 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
175
+ 2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 11
176
+ 2024-08-04 03:52:49,979 INFO SenderThread:12425 [file_pusher.py:join():178] waiting for file pusher
177
+ 2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 12
178
+ 2024-08-04 03:52:49,979 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
179
+ 2024-08-04 03:52:49,979 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 12
180
+ 2024-08-04 03:52:49,979 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
181
+ 2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 12
182
+ 2024-08-04 03:52:49,979 INFO SenderThread:12425 [file_stream.py:finish():595] file stream finish called
183
+ 2024-08-04 03:52:50,544 INFO SenderThread:12425 [file_stream.py:finish():599] file stream finish is done
184
+ 2024-08-04 03:52:50,544 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 13
185
+ 2024-08-04 03:52:50,545 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
186
+ 2024-08-04 03:52:50,545 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 13
187
+ 2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
188
+ 2024-08-04 03:52:50,545 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 13
189
+ 2024-08-04 03:52:50,545 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 14
190
+ 2024-08-04 03:52:50,545 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
191
+ 2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send():382] send: final
192
+ 2024-08-04 03:52:50,545 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 14
193
+ 2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send():382] send: footer
194
+ 2024-08-04 03:52:50,546 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
195
+ 2024-08-04 03:52:50,546 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 14
196
+ 2024-08-04 03:52:50,546 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
197
+ 2024-08-04 03:52:50,546 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
198
+ 2024-08-04 03:52:50,546 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
199
+ 2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: server_info
200
+ 2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: get_summary
201
+ 2024-08-04 03:52:50,547 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
202
+ 2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: sampled_history
203
+ 2024-08-04 03:52:50,547 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: server_info
204
+ 2024-08-04 03:52:50,548 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
205
+ 2024-08-04 03:52:50,549 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: job_info
206
+ 2024-08-04 03:52:50,716 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: job_info
207
+ 2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_history_summary_info():3866] rendering history
208
+ 2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
209
+ 2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_sync_info():3825] logging synced files
210
+ 2024-08-04 03:52:50,717 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: shutdown
211
+ 2024-08-04 03:52:50,717 INFO HandlerThread:12425 [handler.py:finish():869] shutting down handler
212
+ 2024-08-04 03:52:51,549 INFO WriterThread:12425 [datastore.py:close():296] close: /project/wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb
213
+ 2024-08-04 03:52:51,717 INFO SenderThread:12425 [sender.py:finish():1572] shutting down sender
214
+ 2024-08-04 03:52:51,717 INFO SenderThread:12425 [file_pusher.py:finish():172] shutting down file pusher
215
+ 2024-08-04 03:52:51,717 INFO SenderThread:12425 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_035140-nyllt780/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 03:51:40,503 INFO MainThread:12354 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Configure stats pid to 12354
3
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_035140-nyllt780/logs/debug.log
9
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_035140-nyllt780/logs/debug-internal.log
10
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-03:51:30', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 03:51:40,509 INFO MainThread:12354 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 03:51:40,510 INFO MainThread:12354 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 03:51:40,515 INFO MainThread:12354 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 03:51:40,654 INFO MainThread:12354 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 03:51:41,132 INFO MainThread:12354 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 03:51:41,211 INFO MainThread:12354 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 03:51:41,211 INFO MainThread:12354 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 03:51:41,316 INFO MainThread:12354 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 03:51:41,318 INFO MainThread:12354 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 03:52:43,007 INFO MainThread:12354 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 03:52:43,008 INFO MainThread:12354 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 03:52:51,718 WARNING MsgRouterThr:12354 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb ADDED
Binary file (22.5 kB). View file
 
wandb/run-20240804_211947-niq3ake5/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-21:19:16
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722773987.17106
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_211947-niq3ake5/files/output.log ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ BFloat16 enabled for mixed precision - using bfSixteen policy
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 640000
23
+ validation: 35200
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): LlamaForCausalLM(
34
+ (model): LlamaModel(
35
+ (embed_tokens): Embedding(32000, 2048)
36
+ (layers): ModuleList(
37
+ (0-21): 22 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
40
+ (self_attn): LlamaFlashAttention2(
41
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
43
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
45
+ (rotary_emb): LlamaRotaryEmbedding()
46
+ )
47
+ (mlp): LlamaMLP(
48
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
49
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
50
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
51
+ (act_fn): SiLU()
52
+ )
53
+ (input_layernorm): LlamaRMSNorm()
54
+ (post_attention_layernorm): LlamaRMSNorm()
55
+ )
56
+ )
57
+ )
58
+ )
59
+ (norm): LlamaRMSNorm()
60
+ (rotary_emb): LlamaRotaryEmbedding()
61
+ )
62
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
63
+ )
64
+ )
65
+ model config: LlamaConfig {
66
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
67
+ "architectures": [
68
+ "LlamaForCausalLM"
69
+ ],
70
+ "attention_bias": false,
71
+ "attention_dropout": 0.0,
72
+ "bos_token_id": 1,
73
+ "eos_token_id": 2,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 2048,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 5632,
78
+ "label_smoothing": 0.0,
79
+ "max_position_embeddings": 2048,
80
+ "mlp_bias": false,
81
+ "model_type": "llama",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 22,
84
+ "num_key_value_heads": 4,
85
+ "pretraining_tp": 1,
86
+ "rms_norm_eps": 1e-05,
87
+ "rope_scaling": null,
88
+ "rope_theta": 10000.0,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "float32",
91
+ "transformers_version": "4.43.3",
92
+ "use_cache": false,
93
+ "vocab_size": 32000
94
+ }
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
108
+ batch = next(train_dataloader)
109
+ File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
110
+ for x in iter:
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
112
+ data = self._next_data()
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
114
+ return self._process_data(data)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
116
+ data.reraise()
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
118
+ raise exception
119
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
120
+ Original Traceback (most recent call last):
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
122
+ data = fetcher.fetch(index)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
124
+ return self.collate_fn(data)
125
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
126
+ return collate(batch, collate_fn_map=default_collate_fn_map)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
128
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
130
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
132
+ return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
134
+ return torch.stack(batch, 0, out=out)
135
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
wandb/run-20240804_211947-niq3ake5/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T12:19:47.940599",
5
+ "startedAt": "2024-08-04T12:19:47.157671",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-21:19:16"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.044999999999,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.045,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.045,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.045,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.045,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.045,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.045,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.045,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.045,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.045,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.045,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.045,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.045,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.045,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.045,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.045,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.045,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.045,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.045,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 67}}
wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 21:19:47,171 INFO StreamThr :10096 [internal.py:wandb_internal():86] W&B internal server running at pid: 10096, started at: 2024-08-04 21:19:47.170590
2
+ 2024-08-04 21:19:47,173 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 21:19:47,176 INFO WriterThread:10096 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb
4
+ 2024-08-04 21:19:47,177 DEBUG SenderThread:10096 [sender.py:send():382] send: header
5
+ 2024-08-04 21:19:47,316 DEBUG SenderThread:10096 [sender.py:send():382] send: run
6
+ 2024-08-04 21:19:47,822 INFO SenderThread:10096 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_211947-niq3ake5/files
7
+ 2024-08-04 21:19:47,822 INFO SenderThread:10096 [sender.py:_start_run_threads():1136] run started: niq3ake5 with start time 1722773987.17106
8
+ 2024-08-04 21:19:47,827 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 21:19:47,827 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 21:19:47,917 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 21:19:47,923 DEBUG HandlerThread:10096 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 21:19:47,923 DEBUG HandlerThread:10096 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 21:19:47,923 INFO HandlerThread:10096 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 21:19:47,923 INFO SystemMonitor:10096 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 21:19:47,924 INFO HandlerThread:10096 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 21:19:47,924 INFO SystemMonitor:10096 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 21:19:47,924 INFO SystemMonitor:10096 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 21:19:47,925 INFO SystemMonitor:10096 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 21:19:47,926 INFO SystemMonitor:10096 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 21:19:47,927 INFO SystemMonitor:10096 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 21:19:47,940 DEBUG HandlerThread:10096 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 21:19:47,942 DEBUG HandlerThread:10096 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T12:19:47.940599', 'startedAt': '2024-08-04T12:19:47.157671', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-21:19:16'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.044999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 21:19:47,954 INFO HandlerThread:10096 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 21:19:47,954 INFO HandlerThread:10096 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 21:19:47,955 INFO HandlerThread:10096 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 21:19:47,982 DEBUG SenderThread:10096 [sender.py:send():382] send: files
30
+ 2024-08-04 21:19:47,982 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 21:19:47,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 21:19:47,992 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: python_packages
33
+ 2024-08-04 21:19:47,992 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-04 21:19:47,992 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 21:19:47,993 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 21:19:48,264 DEBUG SenderThread:10096 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 21:19:48,653 INFO wandb-upload_0:10096 [upload_job.py:push():131] Uploaded file /tmp/tmpc_z53slvwandb/somhprnl-wandb-metadata.json
38
+ 2024-08-04 21:19:48,823 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt
39
+ 2024-08-04 21:19:48,824 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json
40
+ 2024-08-04 21:19:52,265 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
41
+ 2024-08-04 21:19:53,826 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
42
+ 2024-08-04 21:19:55,827 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
43
+ 2024-08-04 21:19:57,441 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-04 21:19:59,829 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
45
+ 2024-08-04 21:20:02,739 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-04 21:20:02,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
47
+ 2024-08-04 21:20:02,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
48
+ 2024-08-04 21:20:02,992 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
49
+ 2024-08-04 21:20:08,241 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-04 21:20:13,242 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-04 21:20:17,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-04 21:20:17,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-04 21:20:18,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-04 21:20:18,266 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-04 21:20:18,841 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
56
+ 2024-08-04 21:20:23,460 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-04 21:20:28,461 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-04 21:20:32,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-04 21:20:32,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-04 21:20:33,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-04 21:20:34,171 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-04 21:20:39,171 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-04 21:20:44,172 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-04 21:20:47,927 DEBUG SystemMonitor:10096 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-04 21:20:47,929 DEBUG SenderThread:10096 [sender.py:send():382] send: stats
66
+ 2024-08-04 21:20:47,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-04 21:20:47,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-04 21:20:48,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-04 21:20:49,259 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-04 21:20:53,862 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
71
+ 2024-08-04 21:20:54,334 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-04 21:20:54,437 DEBUG SenderThread:10096 [sender.py:send():382] send: config
73
+ 2024-08-04 21:20:54,437 DEBUG SenderThread:10096 [sender.py:send():382] send: config
74
+ 2024-08-04 21:20:55,009 DEBUG SenderThread:10096 [sender.py:send():382] send: exit
75
+ 2024-08-04 21:20:55,009 INFO SenderThread:10096 [sender.py:send_exit():589] handling exit code: 1
76
+ 2024-08-04 21:20:55,009 INFO SenderThread:10096 [sender.py:send_exit():591] handling runtime: 67
77
+ 2024-08-04 21:20:55,010 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
78
+ 2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_exit():597] send defer
79
+ 2024-08-04 21:20:55,011 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
80
+ 2024-08-04 21:20:55,011 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 0
81
+ 2024-08-04 21:20:55,011 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
82
+ 2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 0
83
+ 2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 1
84
+ 2024-08-04 21:20:55,011 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
85
+ 2024-08-04 21:20:55,011 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 1
86
+ 2024-08-04 21:20:55,011 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
87
+ 2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 1
88
+ 2024-08-04 21:20:55,012 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 2
89
+ 2024-08-04 21:20:55,012 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
90
+ 2024-08-04 21:20:55,012 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 2
91
+ 2024-08-04 21:20:55,012 INFO HandlerThread:10096 [system_monitor.py:finish():203] Stopping system monitor
92
+ 2024-08-04 21:20:55,012 DEBUG SystemMonitor:10096 [system_monitor.py:_start():179] Finished system metrics aggregation loop
93
+ 2024-08-04 21:20:55,012 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined cpu monitor
94
+ 2024-08-04 21:20:55,012 DEBUG SystemMonitor:10096 [system_monitor.py:_start():183] Publishing last batch of metrics
95
+ 2024-08-04 21:20:55,012 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined disk monitor
96
+ 2024-08-04 21:20:55,046 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined gpu monitor
97
+ 2024-08-04 21:20:55,047 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined memory monitor
98
+ 2024-08-04 21:20:55,047 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined network monitor
99
+ 2024-08-04 21:20:55,047 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
100
+ 2024-08-04 21:20:55,047 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 2
101
+ 2024-08-04 21:20:55,047 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 3
102
+ 2024-08-04 21:20:55,047 DEBUG SenderThread:10096 [sender.py:send():382] send: stats
103
+ 2024-08-04 21:20:55,047 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
104
+ 2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 3
105
+ 2024-08-04 21:20:55,048 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
106
+ 2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 3
107
+ 2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 4
108
+ 2024-08-04 21:20:55,048 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
109
+ 2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 4
110
+ 2024-08-04 21:20:55,048 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
111
+ 2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 4
112
+ 2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 5
113
+ 2024-08-04 21:20:55,048 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
114
+ 2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 5
115
+ 2024-08-04 21:20:55,049 DEBUG SenderThread:10096 [sender.py:send():382] send: summary
116
+ 2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
117
+ 2024-08-04 21:20:55,050 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
118
+ 2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 5
119
+ 2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 6
120
+ 2024-08-04 21:20:55,050 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
121
+ 2024-08-04 21:20:55,050 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 6
122
+ 2024-08-04 21:20:55,050 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
123
+ 2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 6
124
+ 2024-08-04 21:20:55,053 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-04 21:20:55,265 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 7
126
+ 2024-08-04 21:20:55,265 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
127
+ 2024-08-04 21:20:55,265 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 7
128
+ 2024-08-04 21:20:55,265 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
129
+ 2024-08-04 21:20:55,265 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 7
130
+ 2024-08-04 21:20:55,512 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 8
131
+ 2024-08-04 21:20:55,513 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 21:20:55,513 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 8
133
+ 2024-08-04 21:20:55,513 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 21:20:55,513 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 8
135
+ 2024-08-04 21:20:55,513 INFO SenderThread:10096 [job_builder.py:build():296] Attempting to build job artifact
136
+ 2024-08-04 21:20:55,514 INFO SenderThread:10096 [job_builder.py:_get_source_type():426] is repo sourced job
137
+ 2024-08-04 21:20:55,528 INFO SenderThread:10096 [job_builder.py:build():402] adding wandb-job metadata file
138
+ 2024-08-04 21:20:55,537 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 9
139
+ 2024-08-04 21:20:55,537 DEBUG SenderThread:10096 [sender.py:send():382] send: artifact
140
+ 2024-08-04 21:20:55,537 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
141
+ 2024-08-04 21:20:55,539 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 9
142
+ 2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
143
+ 2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
144
+ 2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json
145
+ 2024-08-04 21:20:56,009 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
146
+ 2024-08-04 21:20:57,540 INFO SenderThread:10096 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
147
+ 2024-08-04 21:20:57,540 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
148
+ 2024-08-04 21:20:57,540 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 9
149
+ 2024-08-04 21:20:57,540 INFO SenderThread:10096 [dir_watcher.py:finish():358] shutting down directory watcher
150
+ 2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_211947-niq3ake5/files
151
+ 2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt requirements.txt
152
+ 2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml config.yaml
153
+ 2024-08-04 21:20:57,867 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json wandb-metadata.json
154
+ 2024-08-04 21:20:57,867 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json wandb-summary.json
155
+ 2024-08-04 21:20:57,869 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/output.log output.log
156
+ 2024-08-04 21:20:57,869 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 10
157
+ 2024-08-04 21:20:57,870 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
158
+ 2024-08-04 21:20:57,872 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
159
+ 2024-08-04 21:20:57,872 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 10
160
+ 2024-08-04 21:20:57,872 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
161
+ 2024-08-04 21:20:57,872 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 10
162
+ 2024-08-04 21:20:57,872 INFO SenderThread:10096 [file_pusher.py:finish():172] shutting down file pusher
163
+ 2024-08-04 21:20:58,009 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
164
+ 2024-08-04 21:20:58,010 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
165
+ 2024-08-04 21:20:58,272 INFO wandb-upload_1:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
166
+ 2024-08-04 21:20:58,376 INFO wandb-upload_0:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt
167
+ 2024-08-04 21:20:58,453 INFO wandb-upload_2:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json
168
+ 2024-08-04 21:20:58,476 INFO wandb-upload_3:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/output.log
169
+ 2024-08-04 21:20:58,677 INFO Thread-11 (_thread_body):10096 [sender.py:transition_state():617] send defer: 11
170
+ 2024-08-04 21:20:58,677 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
171
+ 2024-08-04 21:20:58,677 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 11
172
+ 2024-08-04 21:20:58,677 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
173
+ 2024-08-04 21:20:58,677 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 11
174
+ 2024-08-04 21:20:58,677 INFO SenderThread:10096 [file_pusher.py:join():178] waiting for file pusher
175
+ 2024-08-04 21:20:58,677 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 12
176
+ 2024-08-04 21:20:58,677 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
177
+ 2024-08-04 21:20:58,677 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 12
178
+ 2024-08-04 21:20:58,678 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
179
+ 2024-08-04 21:20:58,678 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 12
180
+ 2024-08-04 21:20:58,678 INFO SenderThread:10096 [file_stream.py:finish():595] file stream finish called
181
+ 2024-08-04 21:20:58,860 INFO SenderThread:10096 [file_stream.py:finish():599] file stream finish is done
182
+ 2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 13
183
+ 2024-08-04 21:20:58,860 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
184
+ 2024-08-04 21:20:58,860 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 13
185
+ 2024-08-04 21:20:58,860 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
186
+ 2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 13
187
+ 2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 14
188
+ 2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send():382] send: final
189
+ 2024-08-04 21:20:58,861 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
190
+ 2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send():382] send: footer
191
+ 2024-08-04 21:20:58,861 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 14
192
+ 2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
193
+ 2024-08-04 21:20:58,861 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 14
194
+ 2024-08-04 21:20:58,861 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
195
+ 2024-08-04 21:20:58,862 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
196
+ 2024-08-04 21:20:58,862 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
197
+ 2024-08-04 21:20:58,862 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
198
+ 2024-08-04 21:20:58,862 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: server_info
199
+ 2024-08-04 21:20:58,863 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: server_info
200
+ 2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: get_summary
201
+ 2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: sampled_history
202
+ 2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
203
+ 2024-08-04 21:20:58,865 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: job_info
204
+ 2024-08-04 21:20:59,033 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: job_info
205
+ 2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_history_summary_info():3866] rendering history
206
+ 2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
207
+ 2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_sync_info():3825] logging synced files
208
+ 2024-08-04 21:20:59,033 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: shutdown
209
+ 2024-08-04 21:20:59,033 INFO HandlerThread:10096 [handler.py:finish():869] shutting down handler
210
+ 2024-08-04 21:20:59,865 INFO WriterThread:10096 [datastore.py:close():296] close: /project/wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb
211
+ 2024-08-04 21:21:00,033 INFO SenderThread:10096 [sender.py:finish():1572] shutting down sender
212
+ 2024-08-04 21:21:00,033 INFO SenderThread:10096 [file_pusher.py:finish():172] shutting down file pusher
213
+ 2024-08-04 21:21:00,033 INFO SenderThread:10096 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_211947-niq3ake5/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 21:19:47,163 INFO MainThread:10025 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 21:19:47,163 INFO MainThread:10025 [wandb_setup.py:_flush():76] Configure stats pid to 10025
3
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_211947-niq3ake5/logs/debug.log
9
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log
10
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-21:19:16', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 21:19:47,169 INFO MainThread:10025 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 21:19:47,170 INFO MainThread:10025 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 21:19:47,175 INFO MainThread:10025 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 21:19:47,312 INFO MainThread:10025 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 21:19:47,827 INFO MainThread:10025 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 21:19:47,910 INFO MainThread:10025 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 21:19:47,910 INFO MainThread:10025 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 21:19:47,990 INFO MainThread:10025 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 21:19:47,992 INFO MainThread:10025 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 21:20:54,436 INFO MainThread:10025 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 21:20:54,436 INFO MainThread:10025 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 21:21:00,034 WARNING MsgRouterThr:10025 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb ADDED
Binary file (22.2 kB). View file
 
wandb/run-20240812_055620-qpw0uqx2/files/config.yaml ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Phi/Phi-2
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-Phi-2_train_2024-08-12-05:56:09
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-Phi-2
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-Phi-2
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Phi/Phi-2
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-Phi-2
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 50304
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723409780.063771
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
wandb/run-20240812_055620-qpw0uqx2/files/output.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/yans-sample-Phi-2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T20:56:20.724831",
5
+ "startedAt": "2024-08-11T20:56:20.050826",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Phi/Phi-2",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Phi/Phi-2",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-Phi-2",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-Phi-2",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-Phi-2",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-Phi-2_train_2024-08-12-05:56:09"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 19}}
wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 05:56:20,065 INFO StreamThr :11662 [internal.py:wandb_internal():86] W&B internal server running at pid: 11662, started at: 2024-08-12 05:56:20.064563
2
+ 2024-08-12 05:56:20,067 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 05:56:20,069 INFO WriterThread:11662 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb
4
+ 2024-08-12 05:56:20,070 DEBUG SenderThread:11662 [sender.py:send():382] send: header
5
+ 2024-08-12 05:56:20,085 DEBUG SenderThread:11662 [sender.py:send():382] send: run
6
+ 2024-08-12 05:56:20,612 INFO SenderThread:11662 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_055620-qpw0uqx2/files
7
+ 2024-08-12 05:56:20,612 INFO SenderThread:11662 [sender.py:_start_run_threads():1136] run started: qpw0uqx2 with start time 1723409780.063771
8
+ 2024-08-12 05:56:20,617 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 05:56:20,617 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 05:56:20,704 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 05:56:20,711 DEBUG HandlerThread:11662 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 05:56:20,711 DEBUG HandlerThread:11662 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 05:56:20,711 INFO HandlerThread:11662 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 05:56:20,711 INFO SystemMonitor:11662 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 05:56:20,711 INFO HandlerThread:11662 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 05:56:20,712 INFO SystemMonitor:11662 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 05:56:20,712 INFO SystemMonitor:11662 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 05:56:20,713 INFO SystemMonitor:11662 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 05:56:20,714 INFO SystemMonitor:11662 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 05:56:20,714 INFO SystemMonitor:11662 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 05:56:20,724 DEBUG HandlerThread:11662 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 05:56:20,729 DEBUG HandlerThread:11662 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:56:20.724831', 'startedAt': '2024-08-11T20:56:20.050826', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Phi/Phi-2', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Phi/Phi-2', '--save', '/work/llm_recipes/models/yans-sample-Phi-2', '--load', '/work/llm_recipes/models/yans-sample-Phi-2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-Phi-2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-Phi-2_train_2024-08-12-05:56:09'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 05:56:20,742 INFO HandlerThread:11662 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 05:56:20,742 INFO HandlerThread:11662 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 05:56:20,743 INFO HandlerThread:11662 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 05:56:20,749 DEBUG SenderThread:11662 [sender.py:send():382] send: files
30
+ 2024-08-12 05:56:20,749 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 05:56:20,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 05:56:20,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 05:56:20,759 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-12 05:56:20,760 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-12 05:56:20,761 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 05:56:21,039 DEBUG SenderThread:11662 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 05:56:21,402 INFO wandb-upload_0:11662 [upload_job.py:push():131] Uploaded file /tmp/tmp1ghcluufwandb/07mrguha-wandb-metadata.json
38
+ 2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
39
+ 2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt
40
+ 2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json
41
+ 2024-08-12 05:56:23,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
42
+ 2024-08-12 05:56:25,807 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 05:56:30,807 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-12 05:56:35,758 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: stop_status
45
+ 2024-08-12 05:56:35,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: internal_messages
46
+ 2024-08-12 05:56:35,759 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: stop_status
47
+ 2024-08-12 05:56:35,962 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
48
+ 2024-08-12 05:56:40,657 DEBUG SenderThread:11662 [sender.py:send():382] send: exit
49
+ 2024-08-12 05:56:40,657 INFO SenderThread:11662 [sender.py:send_exit():589] handling exit code: 255
50
+ 2024-08-12 05:56:40,657 INFO SenderThread:11662 [sender.py:send_exit():591] handling runtime: 19
51
+ 2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
52
+ 2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:send_exit():597] send defer
53
+ 2024-08-12 05:56:40,659 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-12 05:56:40,659 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 0
55
+ 2024-08-12 05:56:40,659 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 0
57
+ 2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 1
58
+ 2024-08-12 05:56:40,659 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-12 05:56:40,660 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 1
60
+ 2024-08-12 05:56:40,660 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
61
+ 2024-08-12 05:56:40,660 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 1
62
+ 2024-08-12 05:56:40,660 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 2
63
+ 2024-08-12 05:56:40,660 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
64
+ 2024-08-12 05:56:40,660 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 2
65
+ 2024-08-12 05:56:40,660 INFO HandlerThread:11662 [system_monitor.py:finish():203] Stopping system monitor
66
+ 2024-08-12 05:56:40,660 DEBUG SystemMonitor:11662 [system_monitor.py:_start():172] Starting system metrics aggregation loop
67
+ 2024-08-12 05:56:40,660 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined cpu monitor
68
+ 2024-08-12 05:56:40,660 DEBUG SystemMonitor:11662 [system_monitor.py:_start():179] Finished system metrics aggregation loop
69
+ 2024-08-12 05:56:40,661 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined disk monitor
70
+ 2024-08-12 05:56:40,661 DEBUG SystemMonitor:11662 [system_monitor.py:_start():183] Publishing last batch of metrics
71
+ 2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined gpu monitor
72
+ 2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined memory monitor
73
+ 2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined network monitor
74
+ 2024-08-12 05:56:40,693 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
75
+ 2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 2
76
+ 2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 3
77
+ 2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send():382] send: stats
78
+ 2024-08-12 05:56:40,694 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-12 05:56:40,694 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 3
80
+ 2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 3
82
+ 2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 4
83
+ 2024-08-12 05:56:40,694 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-12 05:56:40,694 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 4
85
+ 2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
86
+ 2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 4
87
+ 2024-08-12 05:56:40,695 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 5
88
+ 2024-08-12 05:56:40,695 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-12 05:56:40,695 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 5
90
+ 2024-08-12 05:56:40,695 DEBUG SenderThread:11662 [sender.py:send():382] send: summary
91
+ 2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
92
+ 2024-08-12 05:56:40,696 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 5
94
+ 2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 6
95
+ 2024-08-12 05:56:40,696 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
96
+ 2024-08-12 05:56:40,696 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 6
97
+ 2024-08-12 05:56:40,696 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
98
+ 2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 6
99
+ 2024-08-12 05:56:40,699 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
100
+ 2024-08-12 05:56:40,927 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 7
101
+ 2024-08-12 05:56:40,927 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
102
+ 2024-08-12 05:56:40,928 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 7
103
+ 2024-08-12 05:56:40,928 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
104
+ 2024-08-12 05:56:40,928 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 7
105
+ 2024-08-12 05:56:41,625 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml
106
+ 2024-08-12 05:56:41,626 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json
107
+ 2024-08-12 05:56:41,657 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: poll_exit
108
+ 2024-08-12 05:56:41,724 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 8
109
+ 2024-08-12 05:56:41,724 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: poll_exit
110
+ 2024-08-12 05:56:41,724 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-08-12 05:56:41,724 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 8
112
+ 2024-08-12 05:56:41,724 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
113
+ 2024-08-12 05:56:41,725 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 8
114
+ 2024-08-12 05:56:41,725 INFO SenderThread:11662 [job_builder.py:build():296] Attempting to build job artifact
115
+ 2024-08-12 05:56:41,725 INFO SenderThread:11662 [job_builder.py:_get_source_type():426] is repo sourced job
116
+ 2024-08-12 05:56:41,740 INFO SenderThread:11662 [job_builder.py:build():402] adding wandb-job metadata file
117
+ 2024-08-12 05:56:41,748 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 9
118
+ 2024-08-12 05:56:41,749 DEBUG SenderThread:11662 [sender.py:send():382] send: artifact
119
+ 2024-08-12 05:56:41,749 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-12 05:56:41,750 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 9
121
+ 2024-08-12 05:56:42,618 INFO SenderThread:11662 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'versionIndex': 7}}}
122
+ 2024-08-12 05:56:42,618 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
123
+ 2024-08-12 05:56:42,618 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 9
124
+ 2024-08-12 05:56:42,618 INFO SenderThread:11662 [dir_watcher.py:finish():358] shutting down directory watcher
125
+ 2024-08-12 05:56:42,626 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
126
+ 2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_055620-qpw0uqx2/files
127
+ 2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt requirements.txt
128
+ 2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml config.yaml
129
+ 2024-08-12 05:56:42,629 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json wandb-metadata.json
130
+ 2024-08-12 05:56:42,630 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json wandb-summary.json
131
+ 2024-08-12 05:56:42,631 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log output.log
132
+ 2024-08-12 05:56:42,631 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 10
133
+ 2024-08-12 05:56:42,633 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
134
+ 2024-08-12 05:56:42,633 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 10
135
+ 2024-08-12 05:56:42,633 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
136
+ 2024-08-12 05:56:42,634 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 10
137
+ 2024-08-12 05:56:42,635 INFO SenderThread:11662 [file_pusher.py:finish():172] shutting down file pusher
138
+ 2024-08-12 05:56:42,657 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: poll_exit
139
+ 2024-08-12 05:56:42,657 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: poll_exit
140
+ 2024-08-12 05:56:43,034 INFO wandb-upload_0:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt
141
+ 2024-08-12 05:56:43,159 INFO wandb-upload_1:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml
142
+ 2024-08-12 05:56:43,201 INFO wandb-upload_2:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json
143
+ 2024-08-12 05:56:43,202 INFO wandb-upload_3:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
144
+ 2024-08-12 05:56:43,402 INFO Thread-11 (_thread_body):11662 [sender.py:transition_state():617] send defer: 11
145
+ 2024-08-12 05:56:43,403 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
146
+ 2024-08-12 05:56:43,403 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 11
147
+ 2024-08-12 05:56:43,403 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
148
+ 2024-08-12 05:56:43,403 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 11
149
+ 2024-08-12 05:56:43,403 INFO SenderThread:11662 [file_pusher.py:join():178] waiting for file pusher
150
+ 2024-08-12 05:56:43,403 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 12
151
+ 2024-08-12 05:56:43,403 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
152
+ 2024-08-12 05:56:43,404 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 12
153
+ 2024-08-12 05:56:43,404 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
154
+ 2024-08-12 05:56:43,404 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 12
155
+ 2024-08-12 05:56:43,404 INFO SenderThread:11662 [file_stream.py:finish():595] file stream finish called
156
+ 2024-08-12 05:56:43,591 INFO SenderThread:11662 [file_stream.py:finish():599] file stream finish is done
157
+ 2024-08-12 05:56:43,591 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 13
158
+ 2024-08-12 05:56:43,591 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
159
+ 2024-08-12 05:56:43,591 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 13
160
+ 2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
161
+ 2024-08-12 05:56:43,592 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 13
162
+ 2024-08-12 05:56:43,592 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 14
163
+ 2024-08-12 05:56:43,592 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
164
+ 2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send():382] send: final
165
+ 2024-08-12 05:56:43,592 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 14
166
+ 2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send():382] send: footer
167
+ 2024-08-12 05:56:43,593 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
168
+ 2024-08-12 05:56:43,593 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 14
169
+ 2024-08-12 05:56:47,593 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
170
+ 2024-08-12 05:56:52,594 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-12 05:56:57,595 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-12 05:57:02,595 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
173
+ 2024-08-12 05:57:07,596 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
174
+ 2024-08-12 05:57:12,597 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
175
+ 2024-08-12 05:57:17,123 WARNING StreamThr :11662 [internal.py:is_dead():414] Internal process exiting, parent pid 11591 disappeared
176
+ 2024-08-12 05:57:17,123 ERROR StreamThr :11662 [internal.py:wandb_internal():152] Internal process shutdown.
177
+ 2024-08-12 05:57:17,597 INFO SenderThread:11662 [sender.py:finish():1572] shutting down sender
178
+ 2024-08-12 05:57:17,597 INFO SenderThread:11662 [file_pusher.py:finish():172] shutting down file pusher
179
+ 2024-08-12 05:57:17,597 INFO SenderThread:11662 [file_pusher.py:join():178] waiting for file pusher
180
+ 2024-08-12 05:57:17,598 INFO WriterThread:11662 [datastore.py:close():296] close: /project/wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb
181
+ 2024-08-12 05:57:17,598 INFO HandlerThread:11662 [handler.py:finish():869] shutting down handler
wandb/run-20240812_055620-qpw0uqx2/logs/debug.log ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 05:56:20,056 INFO MainThread:11591 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Configure stats pid to 11591
3
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train phi'}
6
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_055620-qpw0uqx2/logs/debug.log
9
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log
10
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Phi/Phi-2', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-Phi-2_train_2024-08-12-05:56:09', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-Phi-2', 'save': '/work/llm_recipes/models/yans-sample-Phi-2', 'base_model': '/share/pretrained_lm/Phi/Phi-2', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-Phi-2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 50304, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 05:56:20,062 INFO MainThread:11591 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 05:56:20,063 INFO MainThread:11591 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 05:56:20,068 INFO MainThread:11591 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 05:56:20,080 INFO MainThread:11591 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 05:56:20,616 INFO MainThread:11591 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 05:56:20,697 INFO MainThread:11591 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 05:56:20,697 INFO MainThread:11591 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 05:56:20,758 INFO MainThread:11591 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 05:56:20,758 INFO MainThread:11591 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 05:56:20,759 INFO MainThread:11591 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 05:56:20,759 INFO MainThread:11591 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 05:56:20,760 INFO MainThread:11591 [wandb_init.py:init():847] run started, returning control to user process
wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb ADDED
Binary file (7.38 kB). View file
 
wandb/run-20240812_073955-ikoro1zp/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-07:39:43
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 10
138
+ save_interval:
139
+ desc: null
140
+ value: 10
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723415995.685329
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240812_073955-ikoro1zp/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240812_073955-ikoro1zp/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_073955-ikoro1zp/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T22:39:56.314869",
5
+ "startedAt": "2024-08-11T22:39:55.672249",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "10",
56
+ "--eval-interval",
57
+ "10",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-07:39:43"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_073955-ikoro1zp/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 2.4635202884674072, "training/perplexity": 11.746088463770842, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1167, "optimizer/lr": 1.9945203423500063e-05, "optimizer/variance_l2": 0.0048320659825907535, "optimizer/variance_sqrt_l2": 0.5318417899390797, "optimizer/momentum_l2": 0.127020084622386, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.2829437255859375, "optimizer/variance_sqrt_l1": 4615.0, "optimizer/momentum_l1": 979.125, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0029296875, "optimizer/variance_sqrt_abs_max": 0.05419921875, "optimizer/momentum_abs_max": 0.01129150390625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 73.68068221400608, "stats/tokens_per_sec": 17793.53774429062, "stats/tokens_per_sec_per_gpu": 17793.53774429062, "stats/tflops": 71.54763648032535, "_timestamp": 1723503194.8273196, "_runtime": 87199.14199066162, "_step": 1167, "evaluation/val_loss": 2.4397435188293457, "evaluation/val_ppl": 11.470099449157715, "_wandb": {"runtime": 87227}}
wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240812_073955-ikoro1zp/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Configure stats pid to 14724
3
+ 2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_073955-ikoro1zp/logs/debug.log
9
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log
10
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:39:43', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 07:39:55,684 INFO MainThread:14724 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 07:39:55,685 INFO MainThread:14724 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 07:39:55,689 INFO MainThread:14724 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 07:39:55,704 INFO MainThread:14724 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 07:39:56,202 INFO MainThread:14724 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 07:39:56,287 INFO MainThread:14724 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 07:39:56,287 INFO MainThread:14724 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 07:39:56,347 INFO MainThread:14724 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 07:39:56,348 INFO MainThread:14724 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 07:40:02,086 INFO MainThread:14724 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 07:40:02,086 INFO MainThread:14724 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240823_160642-78xnl14c/files/config.yaml ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 2048
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-16:06:29
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 320
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 5
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 64
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724396802.555005
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 2048
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_160642-78xnl14c/files/output.log ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/model.pt
5
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/model.pt
6
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
7
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
8
+ BFloat16 enabled for mixed precision - using bfSixteen policy
9
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ Let split = None
13
+ Unable to save the indexes because path_to_cache is None
14
+ Building a BlendedDataset for a single MegatronDataset
15
+ Unable to save the indexes because path_to_cache is None
16
+ Building a BlendedDataset for a single MegatronDataset
17
+ Unable to save the indexes because path_to_cache is None
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 2400000
21
+ validation: 2403200
22
+ test: 3200
23
+ > building train, validation, and test datasets for GPT ...
24
+ > finished creating GPT datasets ...
25
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/optimizer.pt
26
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/optimizer.pt
27
+ model info: FullyShardedDataParallel(
28
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
29
+ (model): Qwen2Model(
30
+ (embed_tokens): Embedding(151936, 896)
31
+ (layers): ModuleList(
32
+ (0-23): 24 x FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): CheckpointWrapper(
34
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
35
+ (self_attn): Qwen2FlashAttention2(
36
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
37
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
40
+ (rotary_emb): Qwen2RotaryEmbedding()
41
+ )
42
+ (mlp): Qwen2MLP(
43
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
46
+ (act_fn): SiLU()
47
+ )
48
+ (input_layernorm): Qwen2RMSNorm()
49
+ (post_attention_layernorm): Qwen2RMSNorm()
50
+ )
51
+ )
52
+ )
53
+ )
54
+ (norm): Qwen2RMSNorm()
55
+ )
56
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
57
+ )
58
+ )
59
+ model config: Qwen2Config {
60
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
61
+ "architectures": [
62
+ "Qwen2ForCausalLM"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "bos_token_id": 151643,
66
+ "eos_token_id": 151643,
67
+ "hidden_act": "silu",
68
+ "hidden_size": 896,
69
+ "initializer_range": 0.02,
70
+ "intermediate_size": 4864,
71
+ "label_smoothing": 0.0,
72
+ "max_position_embeddings": 2048,
73
+ "max_window_layers": 24,
74
+ "model_type": "qwen2",
75
+ "num_attention_heads": 14,
76
+ "num_hidden_layers": 24,
77
+ "num_key_value_heads": 2,
78
+ "rms_norm_eps": 1e-06,
79
+ "rope_theta": 1000000.0,
80
+ "sliding_window": 131072,
81
+ "tie_word_embeddings": true,
82
+ "torch_dtype": "bfloat16",
83
+ "transformers_version": "4.43.3",
84
+ "use_cache": false,
85
+ "use_sliding_window": false,
86
+ "vocab_size": 151936
87
+ }
88
+ [rank0]:[2024-08-23 16:06:47,708] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
89
+ ------------------------------------------------------------------
90
+ iteration: 11 , TFLOPS: 78.6590174462143, Tokens per sec: 22522.217665366814, Loss: 4.254438400268555
91
+ ------------------------------------------------------------------
92
+ ------------------------------------------------------------------
93
+ iteration: 12 , TFLOPS: 82.22378668182658, Tokens per sec: 23542.90812474309, Loss: 4.243721008300781
94
+ ------------------------------------------------------------------
95
+ ------------------------------------------------------------------
96
+ iteration: 13 , TFLOPS: 81.9584539835788, Tokens per sec: 23466.93615131027, Loss: 4.228161334991455
97
+ ------------------------------------------------------------------
98
+ ------------------------------------------------------------------
99
+ iteration: 14 , TFLOPS: 81.93344116019101, Tokens per sec: 23459.774299165976, Loss: 4.26573371887207
100
+ ------------------------------------------------------------------
101
+ ------------------------------------------------------------------
102
+ iteration: 15 , TFLOPS: 82.18786108647078, Tokens per sec: 23532.62164895645, Loss: 4.256962776184082
103
+ ------------------------------------------------------------------
104
+ ------------------------------------------------------------------
105
+ iteration: 16 , TFLOPS: 81.96618258354832, Tokens per sec: 23469.14905984146, Loss: 4.25914192199707
106
+ ------------------------------------------------------------------
107
+ ------------------------------------------------------------------
108
+ iteration: 17 , TFLOPS: 81.87438758063158, Tokens per sec: 23442.865651995726, Loss: 4.231760025024414
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 18 , TFLOPS: 81.88852755364577, Tokens per sec: 23446.914311136756, Loss: 4.2337870597839355
112
+ ------------------------------------------------------------------
113
+ ------------------------------------------------------------------
114
+ iteration: 19 , TFLOPS: 82.2474099184413, Tokens per sec: 23549.672100371965, Loss: 4.217740535736084
115
+ ------------------------------------------------------------------
116
+ ------------------------------------------------------------------
117
+ iteration: 20 , TFLOPS: 81.96982275621532, Tokens per sec: 23470.191340355606, Loss: 4.259789943695068
118
+ ------------------------------------------------------------------
119
+ Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020
120
+ Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/model.pt
121
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
122
+ warnings.warn(
123
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
124
+ warnings.warn(
125
+ Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/model.pt
126
+ Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/optimizer.pt
127
+ [rank0]:[2024-08-23 16:11:31,233] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006403441000202292, 'preprocessing_with_comm': 0.0006401519999599259, 'state_converting': 0.9829786160003096, <Type.ALL: 'all'>: 0.9914544049997858})
128
+ Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/optimizer.pt
129
+ Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/scheduler.pt
130
+ Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/scheduler.pt
131
+ Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/rng.pt
132
+ Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/rng.pt
133
+ Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020, took 4.19s
134
+ eval ppl=57.051639556884766, eval loss=4.043956756591797
135
+ ------------------------------------------------------------------
136
+ iteration: 21 , TFLOPS: 81.67173393242047, Tokens per sec: 23384.840396610194, Loss: 4.235799789428711
137
+ ------------------------------------------------------------------
138
+ ------------------------------------------------------------------
139
+ iteration: 22 , TFLOPS: 82.17554686977023, Tokens per sec: 23529.095753542137, Loss: 4.2608537673950195
140
+ ------------------------------------------------------------------
141
+ ------------------------------------------------------------------
142
+ iteration: 23 , TFLOPS: 81.93471100946451, Tokens per sec: 23460.13789157621, Loss: 4.211125373840332
143
+ ------------------------------------------------------------------
144
+ ------------------------------------------------------------------
145
+ iteration: 24 , TFLOPS: 81.95296715435393, Tokens per sec: 23465.36512276062, Loss: 4.202465534210205
146
+ ------------------------------------------------------------------
147
+ ------------------------------------------------------------------
148
+ iteration: 25 , TFLOPS: 81.86218501721527, Tokens per sec: 23439.37172595571, Loss: 4.217883586883545
149
+ ------------------------------------------------------------------
150
+ ------------------------------------------------------------------
151
+ iteration: 26 , TFLOPS: 82.2054193168547, Tokens per sec: 23537.64904822184, Loss: 4.235620021820068
152
+ ------------------------------------------------------------------
153
+ ------------------------------------------------------------------
154
+ iteration: 27 , TFLOPS: 81.90650355273718, Tokens per sec: 23452.06132895408, Loss: 4.211632251739502
155
+ ------------------------------------------------------------------
156
+ ------------------------------------------------------------------
157
+ iteration: 28 , TFLOPS: 81.86577887433305, Tokens per sec: 23440.400745067473, Loss: 4.186619758605957
158
+ ------------------------------------------------------------------
159
+ ------------------------------------------------------------------
160
+ iteration: 29 , TFLOPS: 81.98724461570576, Tokens per sec: 23475.179692922393, Loss: 4.187148571014404
161
+ ------------------------------------------------------------------
162
+ ------------------------------------------------------------------
163
+ iteration: 30 , TFLOPS: 82.14778410156913, Tokens per sec: 23521.146517348767, Loss: 4.202610492706299
164
+ ------------------------------------------------------------------
165
+ Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030
166
+ Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/model.pt
167
+ Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/model.pt
168
+ Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/optimizer.pt
169
+ [rank0]:[2024-08-23 16:16:16,915] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0063674119996903755, 'preprocessing_with_comm': 0.0006565419998878497, 'state_converting': 0.9940128050002386, <Type.ALL: 'all'>: 1.0024299110000356})
170
+ Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/optimizer.pt
171
+ Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/scheduler.pt
172
+ Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/scheduler.pt
173
+ Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/rng.pt
174
+ Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/rng.pt
175
+ Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030, took 4.25s
176
+ eval ppl=49.190242767333984, eval loss=3.895695209503174
177
+ ------------------------------------------------------------------
178
+ iteration: 31 , TFLOPS: 81.49741363388435, Tokens per sec: 23334.92775042431, Loss: 4.200056552886963
179
+ ------------------------------------------------------------------
180
+ ------------------------------------------------------------------
181
+ iteration: 32 , TFLOPS: 81.78042346796904, Tokens per sec: 23415.961168990125, Loss: 4.181160926818848
182
+ ------------------------------------------------------------------
183
+ ------------------------------------------------------------------
184
+ iteration: 33 , TFLOPS: 81.92238077096499, Tokens per sec: 23456.60740868476, Loss: 4.155094623565674
185
+ ------------------------------------------------------------------
186
+ ------------------------------------------------------------------
187
+ iteration: 34 , TFLOPS: 82.11937602485122, Tokens per sec: 23513.012511762787, Loss: 4.177372932434082
188
+ ------------------------------------------------------------------
189
+ ------------------------------------------------------------------
190
+ iteration: 35 , TFLOPS: 81.71716264856666, Tokens per sec: 23397.847874538613, Loss: 4.142157077789307
191
+ ------------------------------------------------------------------
192
+ ------------------------------------------------------------------
193
+ iteration: 36 , TFLOPS: 81.89085188246439, Tokens per sec: 23447.57982974198, Loss: 4.166767597198486
194
+ ------------------------------------------------------------------
195
+ ------------------------------------------------------------------
196
+ iteration: 37 , TFLOPS: 81.95670790833275, Tokens per sec: 23466.436202441997, Loss: 4.169678688049316
197
+ ------------------------------------------------------------------
198
+ ------------------------------------------------------------------
199
+ iteration: 38 , TFLOPS: 82.18953848455327, Tokens per sec: 23533.101933683516, Loss: 4.145669937133789
200
+ ------------------------------------------------------------------
201
+ ------------------------------------------------------------------
202
+ iteration: 39 , TFLOPS: 81.94531984778251, Tokens per sec: 23463.175490741974, Loss: 4.136501312255859
203
+ ------------------------------------------------------------------
204
+ ------------------------------------------------------------------
205
+ iteration: 40 , TFLOPS: 81.88752676960887, Tokens per sec: 23446.627759427276, Loss: 4.12642240524292
206
+ ------------------------------------------------------------------
207
+ Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040
208
+ Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
209
+ Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
210
+ [rank0]:[2024-08-23 16:21:02,356] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006349127999783377, 'preprocessing_with_comm': 0.0006216020001375, 'state_converting': 0.7473425819998738, <Type.ALL: 'all'>: 0.7556547180001871})
211
+ Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
212
+ Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
213
+ Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/scheduler.pt
214
+ Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/scheduler.pt
215
+ Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/rng.pt
216
+ Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/rng.pt
217
+ Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040, took 3.86s
218
+ eval ppl=30.580110549926758, eval loss=3.4203498363494873
219
+ ------------------------------------------------------------------
220
+ iteration: 41 , TFLOPS: 81.74253389997872, Tokens per sec: 23405.112354382087, Loss: 4.1325507164001465
221
+ ------------------------------------------------------------------
222
+ ------------------------------------------------------------------
223
+ iteration: 42 , TFLOPS: 81.9581488586999, Tokens per sec: 23466.848785752438, Loss: 4.099006652832031
224
+ ------------------------------------------------------------------
225
+ ------------------------------------------------------------------
226
+ iteration: 43 , TFLOPS: 81.91532895406834, Tokens per sec: 23454.588281568267, Loss: 4.136029243469238
227
+ ------------------------------------------------------------------
228
+ ------------------------------------------------------------------
229
+ iteration: 44 , TFLOPS: 81.95839032196434, Tokens per sec: 23466.917923257286, Loss: 4.140143871307373
230
+ ------------------------------------------------------------------
231
+ ------------------------------------------------------------------
232
+ iteration: 45 , TFLOPS: 82.23276643961596, Tokens per sec: 23545.47927381259, Loss: 4.161101341247559
233
+ ------------------------------------------------------------------
234
+ ------------------------------------------------------------------
235
+ iteration: 46 , TFLOPS: 81.96124008786207, Tokens per sec: 23467.733888799798, Loss: 4.099796772003174
236
+ ------------------------------------------------------------------
237
+ ------------------------------------------------------------------
238
+ iteration: 47 , TFLOPS: 81.96352747485656, Tokens per sec: 23468.388829955275, Loss: 4.10368537902832
239
+ ------------------------------------------------------------------
240
+ Traceback (most recent call last):
241
+ File "/project/examples/finetuning.py", line 13, in <module>
242
+ main()
243
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
244
+ train(
245
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
246
+ loss.backward()
247
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
248
+ torch.autograd.backward(
249
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
250
+ _engine_run_backward(
251
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
252
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
253
+ KeyboardInterrupt
wandb/run-20240823_160642-78xnl14c/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_160642-78xnl14c/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T07:06:43.074166",
5
+ "startedAt": "2024-08-23T07:06:42.542542",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "2048",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "5",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "320",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-16:06:29"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_160642-78xnl14c/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 4.10368537902832, "training/perplexity": 60.56307470314165, "utils/batch_size": 5, "utils/global_batch_size": 320, "utils/seq_len": 2049, "utils/gradient_accumulation_steps": 64, "utils/iteration": 47, "optimizer/lr": 2.786e-06, "optimizer/variance_l2": 0.05447102242525233, "optimizer/variance_sqrt_l2": 0.9553866833854993, "optimizer/momentum_l2": 0.9463549769133376, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.9140548706054688, "optimizer/variance_sqrt_l1": 4069.0, "optimizer/momentum_l1": 3366.75, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.044921875, "optimizer/variance_sqrt_abs_max": 0.2119140625, "optimizer/momentum_abs_max": 0.2353515625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 27.938858723999147, "stats/tokens_per_sec": 23468.388829955275, "stats/tokens_per_sec_per_gpu": 23468.388829955275, "stats/tflops": 81.96352747485656, "_timestamp": 1724397861.433749, "_runtime": 1058.8787438869476, "_step": 47, "evaluation/val_loss": 3.4203498363494873, "evaluation/val_ppl": 30.580110549926758, "_wandb": {"runtime": 1064}}
wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240823_160642-78xnl14c/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 16:06:42,548 INFO MainThread:10858 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Configure stats pid to 10858
3
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_160642-78xnl14c/logs/debug.log
9
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log
10
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 2048, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:06:29', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 5, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 64}
13
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():616] starting backend
14
+ 2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-23 16:06:42,554 INFO MainThread:10858 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-23 16:06:42,554 INFO MainThread:10858 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-23 16:06:42,559 INFO MainThread:10858 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-23 16:06:42,570 INFO MainThread:10858 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-23 16:06:42,985 INFO MainThread:10858 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-23 16:06:43,007 INFO MainThread:10858 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-23 16:06:43,007 INFO MainThread:10858 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-23 16:06:43,104 INFO MainThread:10858 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-23 16:06:43,106 INFO MainThread:10858 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-23 16:06:47,996 INFO MainThread:10858 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 2048, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-23 16:06:47,997 INFO MainThread:10858 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-23 16:24:34,131 WARNING MsgRouterThr:10858 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_160642-78xnl14c/run-78xnl14c.wandb ADDED
Binary file (137 kB). View file
 
wandb/run-20240823_162922-z3gs82jm/files/config.yaml ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 2048
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-16:29:10
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 640
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 5
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 128
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724398162.884223
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 2048
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_162922-z3gs82jm/files/output.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
5
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
6
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
7
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
8
+ BFloat16 enabled for mixed precision - using bfSixteen policy
9
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ Let split = None
13
+ --> applying fsdp activation checkpointing...
14
+ > datasets target sizes (minimum size):
15
+ train: 4800000
16
+ validation: 4806400
17
+ test: 6400
18
+ > building train, validation, and test datasets for GPT ...
19
+ Unable to save the indexes because path_to_cache is None
20
+ Building a BlendedDataset for a single MegatronDataset
21
+ Unable to save the indexes because path_to_cache is None
22
+ > finished creating GPT datasets ...
23
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
24
+ Building a BlendedDataset for a single MegatronDataset
25
+ Unable to save the indexes because path_to_cache is None
26
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
27
+ model info: FullyShardedDataParallel(
28
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
29
+ (model): Qwen2Model(
30
+ (embed_tokens): Embedding(151936, 896)
31
+ (layers): ModuleList(
32
+ (0-23): 24 x FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): CheckpointWrapper(
34
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
35
+ (self_attn): Qwen2FlashAttention2(
36
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
37
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
40
+ (rotary_emb): Qwen2RotaryEmbedding()
41
+ )
42
+ (mlp): Qwen2MLP(
43
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
46
+ (act_fn): SiLU()
47
+ )
48
+ (input_layernorm): Qwen2RMSNorm()
49
+ (post_attention_layernorm): Qwen2RMSNorm()
50
+ )
51
+ )
52
+ )
53
+ )
54
+ (norm): Qwen2RMSNorm()
55
+ )
56
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
57
+ )
58
+ )
59
+ model config: Qwen2Config {
60
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
61
+ "architectures": [
62
+ "Qwen2ForCausalLM"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "bos_token_id": 151643,
66
+ "eos_token_id": 151643,
67
+ "hidden_act": "silu",
68
+ "hidden_size": 896,
69
+ "initializer_range": 0.02,
70
+ "intermediate_size": 4864,
71
+ "label_smoothing": 0.0,
72
+ "max_position_embeddings": 2048,
73
+ "max_window_layers": 24,
74
+ "model_type": "qwen2",
75
+ "num_attention_heads": 14,
76
+ "num_hidden_layers": 24,
77
+ "num_key_value_heads": 2,
78
+ "rms_norm_eps": 1e-06,
79
+ "rope_theta": 1000000.0,
80
+ "sliding_window": 131072,
81
+ "tie_word_embeddings": true,
82
+ "torch_dtype": "bfloat16",
83
+ "transformers_version": "4.43.3",
84
+ "use_cache": false,
85
+ "use_sliding_window": false,
86
+ "vocab_size": 151936
87
+ }
88
+ [rank0]:[2024-08-23 16:29:30,218] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
89
+ ------------------------------------------------------------------
90
+ iteration: 41 , TFLOPS: 80.56174819694237, Tokens per sec: 23067.0212685365, Loss: 4.141458988189697
91
+ ------------------------------------------------------------------
92
+ ------------------------------------------------------------------
93
+ iteration: 42 , TFLOPS: 82.11115614789513, Tokens per sec: 23510.658937258577, Loss: 4.13422155380249
94
+ ------------------------------------------------------------------
95
+ ------------------------------------------------------------------
96
+ iteration: 43 , TFLOPS: 82.06292558214139, Tokens per sec: 23496.84921352573, Loss: 4.125084400177002
97
+ ------------------------------------------------------------------
98
+ ------------------------------------------------------------------
99
+ iteration: 44 , TFLOPS: 82.21135543718982, Tokens per sec: 23539.348721045328, Loss: 4.142415523529053
100
+ ------------------------------------------------------------------
101
+ ------------------------------------------------------------------
102
+ iteration: 45 , TFLOPS: 82.19624054931023, Tokens per sec: 23535.020917242735, Loss: 4.127298831939697
103
+ ------------------------------------------------------------------
104
+ ------------------------------------------------------------------
105
+ iteration: 46 , TFLOPS: 82.08863919564567, Tokens per sec: 23504.21172095474, Loss: 4.131596565246582
106
+ ------------------------------------------------------------------
107
+ ------------------------------------------------------------------
108
+ iteration: 47 , TFLOPS: 82.19786737309032, Tokens per sec: 23535.486721170473, Loss: 4.140783786773682
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 48 , TFLOPS: 82.07990700682468, Tokens per sec: 23501.711457619313, Loss: 4.111098289489746
112
+ ------------------------------------------------------------------
113
+ Traceback (most recent call last):
114
+ File "/project/examples/finetuning.py", line 13, in <module>
115
+ main()
116
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
117
+ train(
118
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
119
+ loss: torch.Tensor = model(**batch).loss
120
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
121
+ return self._call_impl(*args, **kwargs)
122
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
123
+ return forward_call(*args, **kwargs)
124
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
125
+ output = self._fsdp_wrapped_module(*args, **kwargs)
126
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
127
+ return self._call_impl(*args, **kwargs)
128
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
129
+ return forward_call(*args, **kwargs)
130
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 1054, in forward
131
+ outputs = self.model(
132
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
133
+ return self._call_impl(*args, **kwargs)
134
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
135
+ return forward_call(*args, **kwargs)
136
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 856, in forward
137
+ layer_outputs = decoder_layer(
138
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
139
+ return self._call_impl(*args, **kwargs)
140
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
141
+ return forward_call(*args, **kwargs)
142
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
143
+ output = self._fsdp_wrapped_module(*args, **kwargs)
144
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
145
+ return self._call_impl(*args, **kwargs)
146
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
147
+ return forward_call(*args, **kwargs)
148
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 168, in forward
149
+ return self.checkpoint_fn( # type: ignore[misc]
150
+ File "/usr/local/lib/python3.10/dist-packages/torch/_compile.py", line 24, in inner
151
+ return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
152
+ File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 417, in _fn
153
+ return fn(*args, **kwargs)
154
+ File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/external_utils.py", line 25, in inner
155
+ return fn(*args, **kwargs)
156
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 488, in checkpoint
157
+ ret = function(*args, **kwargs)
158
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
159
+ return self._call_impl(*args, **kwargs)
160
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
161
+ return forward_call(*args, **kwargs)
162
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 609, in forward
163
+ hidden_states = self.post_attention_layernorm(hidden_states)
164
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
165
+ return self._call_impl(*args, **kwargs)
166
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
167
+ return forward_call(*args, **kwargs)
168
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 78, in forward
169
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
170
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 1091, in pack_hook
171
+ with torch.no_grad():
172
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 149, in __new__
173
+ def __new__(cls, orig_func=None):
174
+ KeyboardInterrupt
wandb/run-20240823_162922-z3gs82jm/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T07:29:23.385958",
5
+ "startedAt": "2024-08-23T07:29:22.871856",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "2048",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "5",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "640",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-16:29:10"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 4.111098289489746, "training/perplexity": 61.013691480602496, "utils/batch_size": 5, "utils/global_batch_size": 640, "utils/seq_len": 2049, "utils/gradient_accumulation_steps": 128, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.05465312149531553, "optimizer/variance_sqrt_l2": 0.9576321330918345, "optimizer/momentum_l2": 0.9493419425990095, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.91644287109375, "optimizer/variance_sqrt_l1": 3987.5, "optimizer/momentum_l1": 3375.75, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.044921875, "optimizer/variance_sqrt_abs_max": 0.2119140625, "optimizer/momentum_abs_max": 0.234375, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 55.79848949999996, "stats/tokens_per_sec": 23501.711457619313, "stats/tokens_per_sec_per_gpu": 23501.711457619313, "stats/tflops": 82.07990700682468, "_timestamp": 1724398618.465386, "_runtime": 455.5811629295349, "_step": 48, "_wandb": {"runtime": 505}}
wandb/run-20240823_162922-z3gs82jm/logs/debug-internal.log ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 16:29:22,886 INFO StreamThr :11966 [internal.py:wandb_internal():86] W&B internal server running at pid: 11966, started at: 2024-08-23 16:29:22.885536
2
+ 2024-08-23 16:29:22,888 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 16:29:22,889 INFO WriterThread:11966 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_162922-z3gs82jm/run-z3gs82jm.wandb
4
+ 2024-08-23 16:29:22,890 DEBUG SenderThread:11966 [sender.py:send():382] send: header
5
+ 2024-08-23 16:29:22,904 DEBUG SenderThread:11966 [sender.py:send():382] send: run
6
+ 2024-08-23 16:29:23,294 INFO SenderThread:11966 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_162922-z3gs82jm/files
7
+ 2024-08-23 16:29:23,294 INFO SenderThread:11966 [sender.py:_start_run_threads():1136] run started: z3gs82jm with start time 1724398162.884223
8
+ 2024-08-23 16:29:23,300 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 16:29:23,300 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 16:29:23,367 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 16:29:23,374 DEBUG HandlerThread:11966 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 16:29:23,374 DEBUG HandlerThread:11966 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 16:29:23,374 INFO HandlerThread:11966 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 16:29:23,374 INFO SystemMonitor:11966 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 16:29:23,374 INFO HandlerThread:11966 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 16:29:23,374 INFO SystemMonitor:11966 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 16:29:23,375 INFO SystemMonitor:11966 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 16:29:23,375 INFO SystemMonitor:11966 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 16:29:23,377 INFO SystemMonitor:11966 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 16:29:23,378 INFO SystemMonitor:11966 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 16:29:23,385 DEBUG HandlerThread:11966 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 16:29:23,387 DEBUG HandlerThread:11966 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:29:23.385958', 'startedAt': '2024-08-23T07:29:22.871856', 'docker': None, 'cuda': None, 'args': ('--seq-length', '2048', '--sliding-window-size', '131072', '--micro-batch-size', '5', '--valid_micro_batch_size', '1', '--global-batch-size', '640', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:29:10'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
26
+ 2024-08-23 16:29:23,400 INFO HandlerThread:11966 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 16:29:23,400 INFO HandlerThread:11966 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 16:29:23,402 INFO HandlerThread:11966 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 16:29:23,407 DEBUG SenderThread:11966 [sender.py:send():382] send: files
30
+ 2024-08-23 16:29:23,407 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 16:29:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 16:29:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-23 16:29:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-23 16:29:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 16:29:23,421 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 16:29:23,617 DEBUG SenderThread:11966 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 16:29:24,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
38
+ 2024-08-23 16:29:24,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt
39
+ 2024-08-23 16:29:24,297 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json
40
+ 2024-08-23 16:29:24,474 INFO wandb-upload_0:11966 [upload_job.py:push():131] Uploaded file /tmp/tmpcv014twmwandb/xf5mvw68-wandb-metadata.json
41
+ 2024-08-23 16:29:26,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
42
+ 2024-08-23 16:29:28,298 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
43
+ 2024-08-23 16:29:28,658 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-23 16:29:30,299 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
45
+ 2024-08-23 16:29:30,499 DEBUG SenderThread:11966 [sender.py:send():382] send: config
46
+ 2024-08-23 16:29:30,500 DEBUG SenderThread:11966 [sender.py:send():382] send: config
47
+ 2024-08-23 16:29:32,300 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
48
+ 2024-08-23 16:29:34,500 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-23 16:29:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
50
+ 2024-08-23 16:29:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
51
+ 2024-08-23 16:29:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
52
+ 2024-08-23 16:29:39,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-23 16:29:44,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-23 16:29:49,686 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-23 16:29:53,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
56
+ 2024-08-23 16:29:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
57
+ 2024-08-23 16:29:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
58
+ 2024-08-23 16:29:55,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-23 16:29:56,312 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml
60
+ 2024-08-23 16:30:00,842 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-23 16:30:05,842 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-23 16:30:08,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
63
+ 2024-08-23 16:30:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
64
+ 2024-08-23 16:30:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
65
+ 2024-08-23 16:30:11,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-23 16:30:16,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-23 16:30:21,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-23 16:30:23,378 DEBUG SystemMonitor:11966 [system_monitor.py:_start():172] Starting system metrics aggregation loop
69
+ 2024-08-23 16:30:23,380 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
70
+ 2024-08-23 16:30:23,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
71
+ 2024-08-23 16:30:23,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
72
+ 2024-08-23 16:30:23,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
73
+ 2024-08-23 16:30:27,617 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-23 16:30:28,122 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
75
+ 2024-08-23 16:30:30,329 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
76
+ 2024-08-23 16:30:33,164 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
77
+ 2024-08-23 16:30:38,165 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
78
+ 2024-08-23 16:30:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
79
+ 2024-08-23 16:30:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
80
+ 2024-08-23 16:30:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
81
+ 2024-08-23 16:30:43,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-23 16:30:48,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-23 16:30:53,380 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
84
+ 2024-08-23 16:30:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-23 16:30:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
86
+ 2024-08-23 16:30:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
87
+ 2024-08-23 16:30:53,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-23 16:30:58,640 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-23 16:31:03,640 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-23 16:31:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-23 16:31:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-23 16:31:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-23 16:31:08,666 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-23 16:31:13,666 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-23 16:31:18,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-23 16:31:23,382 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
97
+ 2024-08-23 16:31:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-23 16:31:23,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-23 16:31:23,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-23 16:31:23,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-23 16:31:23,901 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
102
+ 2024-08-23 16:31:23,904 DEBUG SenderThread:11966 [sender.py:send():382] send: history
103
+ 2024-08-23 16:31:23,904 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
104
+ 2024-08-23 16:31:23,906 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
105
+ 2024-08-23 16:31:24,355 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
106
+ 2024-08-23 16:31:26,356 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
107
+ 2024-08-23 16:31:28,944 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-23 16:31:33,945 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-08-23 16:31:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
110
+ 2024-08-23 16:31:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
111
+ 2024-08-23 16:31:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
112
+ 2024-08-23 16:31:39,638 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-23 16:31:44,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-23 16:31:49,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
115
+ 2024-08-23 16:31:53,383 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
116
+ 2024-08-23 16:31:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
117
+ 2024-08-23 16:31:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
118
+ 2024-08-23 16:31:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
119
+ 2024-08-23 16:31:54,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-23 16:31:59,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-23 16:32:04,680 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
122
+ 2024-08-23 16:32:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
123
+ 2024-08-23 16:32:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
124
+ 2024-08-23 16:32:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
125
+ 2024-08-23 16:32:10,613 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-23 16:32:15,613 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
127
+ 2024-08-23 16:32:19,715 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
128
+ 2024-08-23 16:32:19,717 DEBUG SenderThread:11966 [sender.py:send():382] send: history
129
+ 2024-08-23 16:32:19,717 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
130
+ 2024-08-23 16:32:19,719 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
131
+ 2024-08-23 16:32:20,383 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
132
+ 2024-08-23 16:32:20,760 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
133
+ 2024-08-23 16:32:22,384 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
134
+ 2024-08-23 16:32:23,385 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
135
+ 2024-08-23 16:32:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
136
+ 2024-08-23 16:32:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
137
+ 2024-08-23 16:32:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
138
+ 2024-08-23 16:32:26,593 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
139
+ 2024-08-23 16:32:31,594 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-08-23 16:32:36,594 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
141
+ 2024-08-23 16:32:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
142
+ 2024-08-23 16:32:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
143
+ 2024-08-23 16:32:38,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
144
+ 2024-08-23 16:32:41,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
145
+ 2024-08-23 16:32:46,675 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
146
+ 2024-08-23 16:32:51,676 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-23 16:32:53,384 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
148
+ 2024-08-23 16:32:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
149
+ 2024-08-23 16:32:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
150
+ 2024-08-23 16:32:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
151
+ 2024-08-23 16:32:57,603 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
152
+ 2024-08-23 16:33:02,604 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
153
+ 2024-08-23 16:33:07,604 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-08-23 16:33:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
155
+ 2024-08-23 16:33:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
156
+ 2024-08-23 16:33:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
157
+ 2024-08-23 16:33:12,676 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
158
+ 2024-08-23 16:33:15,428 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
159
+ 2024-08-23 16:33:15,430 DEBUG SenderThread:11966 [sender.py:send():382] send: history
160
+ 2024-08-23 16:33:15,430 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
161
+ 2024-08-23 16:33:15,432 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
162
+ 2024-08-23 16:33:16,412 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
163
+ 2024-08-23 16:33:18,413 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
164
+ 2024-08-23 16:33:18,472 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-23 16:33:23,385 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
166
+ 2024-08-23 16:33:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
167
+ 2024-08-23 16:33:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
168
+ 2024-08-23 16:33:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
169
+ 2024-08-23 16:33:23,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
170
+ 2024-08-23 16:33:28,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-23 16:33:33,683 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-23 16:33:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
173
+ 2024-08-23 16:33:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
174
+ 2024-08-23 16:33:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
175
+ 2024-08-23 16:33:39,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
176
+ 2024-08-23 16:33:44,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-08-23 16:33:49,668 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-23 16:33:53,386 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
179
+ 2024-08-23 16:33:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
180
+ 2024-08-23 16:33:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
181
+ 2024-08-23 16:33:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
182
+ 2024-08-23 16:33:55,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
183
+ 2024-08-23 16:34:00,612 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-08-23 16:34:05,612 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
185
+ 2024-08-23 16:34:08,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
186
+ 2024-08-23 16:34:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
187
+ 2024-08-23 16:34:08,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
188
+ 2024-08-23 16:34:10,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
189
+ 2024-08-23 16:34:11,150 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
190
+ 2024-08-23 16:34:11,152 DEBUG SenderThread:11966 [sender.py:send():382] send: history
191
+ 2024-08-23 16:34:11,152 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
192
+ 2024-08-23 16:34:11,154 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
193
+ 2024-08-23 16:34:11,439 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
194
+ 2024-08-23 16:34:12,439 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
195
+ 2024-08-23 16:34:16,192 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
196
+ 2024-08-23 16:34:21,193 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
197
+ 2024-08-23 16:34:23,387 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
198
+ 2024-08-23 16:34:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
199
+ 2024-08-23 16:34:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
200
+ 2024-08-23 16:34:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
201
+ 2024-08-23 16:34:26,589 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
202
+ 2024-08-23 16:34:31,590 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
203
+ 2024-08-23 16:34:36,590 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
204
+ 2024-08-23 16:34:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
205
+ 2024-08-23 16:34:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
206
+ 2024-08-23 16:34:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
207
+ 2024-08-23 16:34:41,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
208
+ 2024-08-23 16:34:46,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-23 16:34:51,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-23 16:34:53,389 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
211
+ 2024-08-23 16:34:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
212
+ 2024-08-23 16:34:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
213
+ 2024-08-23 16:34:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
214
+ 2024-08-23 16:34:57,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-23 16:35:02,672 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
216
+ 2024-08-23 16:35:06,945 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
217
+ 2024-08-23 16:35:06,948 DEBUG SenderThread:11966 [sender.py:send():382] send: history
218
+ 2024-08-23 16:35:06,948 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
219
+ 2024-08-23 16:35:06,949 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
220
+ 2024-08-23 16:35:07,467 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
221
+ 2024-08-23 16:35:07,988 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
222
+ 2024-08-23 16:35:08,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
223
+ 2024-08-23 16:35:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
224
+ 2024-08-23 16:35:08,421 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
225
+ 2024-08-23 16:35:08,468 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
226
+ 2024-08-23 16:35:13,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-23 16:35:18,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
228
+ 2024-08-23 16:35:23,389 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
229
+ 2024-08-23 16:35:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
230
+ 2024-08-23 16:35:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
231
+ 2024-08-23 16:35:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
232
+ 2024-08-23 16:35:24,596 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
233
+ 2024-08-23 16:35:29,597 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
234
+ 2024-08-23 16:35:34,597 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
235
+ 2024-08-23 16:35:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
236
+ 2024-08-23 16:35:38,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
237
+ 2024-08-23 16:35:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
238
+ 2024-08-23 16:35:39,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
239
+ 2024-08-23 16:35:44,690 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
240
+ 2024-08-23 16:35:49,690 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-08-23 16:35:53,390 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
242
+ 2024-08-23 16:35:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
243
+ 2024-08-23 16:35:53,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
244
+ 2024-08-23 16:35:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
245
+ 2024-08-23 16:35:55,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
246
+ 2024-08-23 16:36:00,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
247
+ 2024-08-23 16:36:02,665 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
248
+ 2024-08-23 16:36:02,667 DEBUG SenderThread:11966 [sender.py:send():382] send: history
249
+ 2024-08-23 16:36:02,667 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
250
+ 2024-08-23 16:36:02,668 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
251
+ 2024-08-23 16:36:03,495 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
252
+ 2024-08-23 16:36:04,496 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
253
+ 2024-08-23 16:36:05,708 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
254
+ 2024-08-23 16:36:08,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
255
+ 2024-08-23 16:36:08,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
256
+ 2024-08-23 16:36:08,421 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
257
+ 2024-08-23 16:36:11,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
258
+ 2024-08-23 16:36:16,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
259
+ 2024-08-23 16:36:21,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
260
+ 2024-08-23 16:36:23,391 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
261
+ 2024-08-23 16:36:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
262
+ 2024-08-23 16:36:23,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
263
+ 2024-08-23 16:36:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
264
+ 2024-08-23 16:36:26,684 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
265
+ 2024-08-23 16:36:31,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
266
+ 2024-08-23 16:36:36,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
267
+ 2024-08-23 16:36:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
268
+ 2024-08-23 16:36:38,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
269
+ 2024-08-23 16:36:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
270
+ 2024-08-23 16:36:41,692 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
271
+ 2024-08-23 16:36:46,692 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
272
+ 2024-08-23 16:36:51,693 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
273
+ 2024-08-23 16:36:53,392 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
274
+ 2024-08-23 16:36:53,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
275
+ 2024-08-23 16:36:53,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
276
+ 2024-08-23 16:36:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
277
+ 2024-08-23 16:36:57,615 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
278
+ 2024-08-23 16:36:58,466 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
279
+ 2024-08-23 16:36:58,468 DEBUG SenderThread:11966 [sender.py:send():382] send: history
280
+ 2024-08-23 16:36:58,468 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
281
+ 2024-08-23 16:36:58,469 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
282
+ 2024-08-23 16:36:58,523 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
283
+ 2024-08-23 16:37:00,524 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
284
+ 2024-08-23 16:37:03,508 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
285
+ 2024-08-23 16:37:08,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
286
+ 2024-08-23 16:37:08,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
287
+ 2024-08-23 16:37:08,422 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
288
+ 2024-08-23 16:37:08,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
289
+ 2024-08-23 16:37:13,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
290
+ 2024-08-23 16:37:18,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
291
+ 2024-08-23 16:37:23,393 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
292
+ 2024-08-23 16:37:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
293
+ 2024-08-23 16:37:23,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
294
+ 2024-08-23 16:37:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
295
+ 2024-08-23 16:37:24,586 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
296
+ 2024-08-23 16:37:29,587 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
297
+ 2024-08-23 16:37:34,587 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
298
+ 2024-08-23 16:37:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
299
+ 2024-08-23 16:37:38,421 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
300
+ 2024-08-23 16:37:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
301
+ 2024-08-23 16:37:40,196 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
302
+ 2024-08-23 16:37:45,197 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
303
+ 2024-08-23 16:37:48,676 DEBUG SenderThread:11966 [sender.py:send():382] send: exit
304
+ 2024-08-23 16:37:48,676 INFO SenderThread:11966 [sender.py:send_exit():589] handling exit code: 255
305
+ 2024-08-23 16:37:48,676 INFO SenderThread:11966 [sender.py:send_exit():591] handling runtime: 505
306
+ 2024-08-23 16:37:48,677 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
307
+ 2024-08-23 16:37:48,677 INFO SenderThread:11966 [sender.py:send_exit():597] send defer
308
+ 2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
309
+ 2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 0
310
+ 2024-08-23 16:37:48,678 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
311
+ 2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 0
312
+ 2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 1
313
+ 2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
314
+ 2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 1
315
+ 2024-08-23 16:37:48,678 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
316
+ 2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 1
317
+ 2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 2
318
+ 2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
319
+ 2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 2
320
+ 2024-08-23 16:37:48,678 INFO HandlerThread:11966 [system_monitor.py:finish():203] Stopping system monitor
321
+ 2024-08-23 16:37:48,679 DEBUG SystemMonitor:11966 [system_monitor.py:_start():179] Finished system metrics aggregation loop
322
+ 2024-08-23 16:37:48,679 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined cpu monitor
323
+ 2024-08-23 16:37:48,679 DEBUG SystemMonitor:11966 [system_monitor.py:_start():183] Publishing last batch of metrics
324
+ 2024-08-23 16:37:48,679 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined disk monitor
325
+ 2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined gpu monitor
326
+ 2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined memory monitor
327
+ 2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined network monitor
328
+ 2024-08-23 16:37:48,713 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
329
+ 2024-08-23 16:37:48,713 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 2
330
+ 2024-08-23 16:37:48,713 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 3
331
+ 2024-08-23 16:37:48,713 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
332
+ 2024-08-23 16:37:48,713 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
333
+ 2024-08-23 16:37:48,713 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 3
334
+ 2024-08-23 16:37:48,715 DEBUG SenderThread:11966 [sender.py:send():382] send: history
335
+ 2024-08-23 16:37:48,715 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
336
+ 2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
337
+ 2024-08-23 16:37:48,716 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
338
+ 2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 3
339
+ 2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 4
340
+ 2024-08-23 16:37:48,716 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
341
+ 2024-08-23 16:37:48,716 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 4
342
+ 2024-08-23 16:37:48,717 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
343
+ 2024-08-23 16:37:48,717 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 4
344
+ 2024-08-23 16:37:48,717 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 5
345
+ 2024-08-23 16:37:48,717 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
346
+ 2024-08-23 16:37:48,717 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 5
347
+ 2024-08-23 16:37:48,718 DEBUG SenderThread:11966 [sender.py:send():382] send: summary
348
+ 2024-08-23 16:37:48,718 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
349
+ 2024-08-23 16:37:48,719 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
350
+ 2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 5
351
+ 2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 6
352
+ 2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
353
+ 2024-08-23 16:37:48,719 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 6
354
+ 2024-08-23 16:37:48,719 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
355
+ 2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 6
356
+ 2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 7
357
+ 2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
358
+ 2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
359
+ 2024-08-23 16:37:48,719 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 7
360
+ 2024-08-23 16:37:48,720 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
361
+ 2024-08-23 16:37:48,720 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 7
362
+ 2024-08-23 16:37:49,550 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
363
+ 2024-08-23 16:37:49,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
364
+ 2024-08-23 16:37:50,524 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 8
365
+ 2024-08-23 16:37:50,524 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
366
+ 2024-08-23 16:37:50,524 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
367
+ 2024-08-23 16:37:50,524 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 8
368
+ 2024-08-23 16:37:50,524 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
369
+ 2024-08-23 16:37:50,524 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 8
370
+ 2024-08-23 16:37:50,525 INFO SenderThread:11966 [job_builder.py:build():296] Attempting to build job artifact
371
+ 2024-08-23 16:37:50,525 INFO SenderThread:11966 [job_builder.py:_get_source_type():426] is repo sourced job
372
+ 2024-08-23 16:37:50,540 INFO SenderThread:11966 [job_builder.py:build():402] adding wandb-job metadata file
373
+ 2024-08-23 16:37:50,549 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 9
374
+ 2024-08-23 16:37:50,550 DEBUG SenderThread:11966 [sender.py:send():382] send: artifact
375
+ 2024-08-23 16:37:50,550 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
376
+ 2024-08-23 16:37:50,551 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 9
377
+ 2024-08-23 16:37:50,551 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
378
+ 2024-08-23 16:37:50,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
379
+ 2024-08-23 16:37:51,630 INFO wandb-upload_0:11966 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmp7k_y5w7r
380
+ 2024-08-23 16:37:52,084 INFO wandb-upload_1:11966 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp4s29y7vc
381
+ 2024-08-23 16:37:53,444 INFO SenderThread:11966 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'versionIndex': 2}}}
382
+ 2024-08-23 16:37:53,444 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
383
+ 2024-08-23 16:37:53,444 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 9
384
+ 2024-08-23 16:37:53,444 INFO SenderThread:11966 [dir_watcher.py:finish():358] shutting down directory watcher
385
+ 2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_162922-z3gs82jm/files
386
+ 2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt requirements.txt
387
+ 2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml config.yaml
388
+ 2024-08-23 16:37:53,555 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json wandb-metadata.json
389
+ 2024-08-23 16:37:53,555 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json wandb-summary.json
390
+ 2024-08-23 16:37:53,556 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log output.log
391
+ 2024-08-23 16:37:53,558 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 10
392
+ 2024-08-23 16:37:53,558 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
393
+ 2024-08-23 16:37:53,559 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
394
+ 2024-08-23 16:37:53,559 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 10
395
+ 2024-08-23 16:37:53,560 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
396
+ 2024-08-23 16:37:53,560 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 10
397
+ 2024-08-23 16:37:53,560 INFO SenderThread:11966 [file_pusher.py:finish():172] shutting down file pusher
398
+ 2024-08-23 16:37:53,672 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
399
+ 2024-08-23 16:37:53,672 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
400
+ 2024-08-23 16:37:53,988 INFO wandb-upload_0:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt
401
+ 2024-08-23 16:37:54,025 INFO wandb-upload_1:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml
402
+ 2024-08-23 16:37:54,673 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
403
+ 2024-08-23 16:37:54,673 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
404
+ 2024-08-23 16:37:55,582 INFO wandb-upload_2:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
405
+ 2024-08-23 16:37:55,638 INFO wandb-upload_3:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
406
+ 2024-08-23 16:37:55,673 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
407
+ 2024-08-23 16:37:55,674 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
408
+ 2024-08-23 16:37:55,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
409
+ 2024-08-23 16:37:55,838 INFO Thread-11 (_thread_body):11966 [sender.py:transition_state():617] send defer: 11
410
+ 2024-08-23 16:37:55,839 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
411
+ 2024-08-23 16:37:55,839 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 11
412
+ 2024-08-23 16:37:55,839 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
413
+ 2024-08-23 16:37:55,839 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 11
414
+ 2024-08-23 16:37:55,839 INFO SenderThread:11966 [file_pusher.py:join():178] waiting for file pusher
415
+ 2024-08-23 16:37:55,840 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 12
416
+ 2024-08-23 16:37:55,840 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
417
+ 2024-08-23 16:37:55,840 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 12
418
+ 2024-08-23 16:37:55,840 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
419
+ 2024-08-23 16:37:55,840 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 12
420
+ 2024-08-23 16:37:55,840 INFO SenderThread:11966 [file_stream.py:finish():595] file stream finish called
421
+ 2024-08-23 16:37:56,021 INFO SenderThread:11966 [file_stream.py:finish():599] file stream finish is done
422
+ 2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 13
423
+ 2024-08-23 16:37:56,021 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
424
+ 2024-08-23 16:37:56,021 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 13
425
+ 2024-08-23 16:37:56,021 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
426
+ 2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 13
427
+ 2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 14
428
+ 2024-08-23 16:37:56,021 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
429
+ 2024-08-23 16:37:56,021 DEBUG SenderThread:11966 [sender.py:send():382] send: final
430
+ 2024-08-23 16:37:56,021 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 14
431
+ 2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send():382] send: footer
432
+ 2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
433
+ 2024-08-23 16:37:56,022 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 14
434
+ 2024-08-23 16:37:56,022 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
435
+ 2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
436
+ 2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
437
+ 2024-08-23 16:37:56,023 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
438
+ 2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: server_info
439
+ 2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: get_summary
440
+ 2024-08-23 16:37:56,024 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: server_info
441
+ 2024-08-23 16:37:56,025 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: sampled_history
442
+ 2024-08-23 16:37:56,026 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
443
+ 2024-08-23 16:37:56,027 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: job_info
444
+ 2024-08-23 16:37:56,184 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: job_info
445
+ 2024-08-23 16:37:56,185 INFO MainThread:11966 [wandb_run.py:_footer_history_summary_info():3866] rendering history
446
+ 2024-08-23 16:37:56,185 INFO MainThread:11966 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
447
+ 2024-08-23 16:37:56,186 INFO MainThread:11966 [wandb_run.py:_footer_sync_info():3825] logging synced files
448
+ 2024-08-23 16:37:56,186 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: shutdown
449
+ 2024-08-23 16:37:56,186 INFO HandlerThread:11966 [handler.py:finish():869] shutting down handler
450
+ 2024-08-23 16:37:57,027 INFO WriterThread:11966 [datastore.py:close():296] close: /project/wandb/run-20240823_162922-z3gs82jm/run-z3gs82jm.wandb
451
+ 2024-08-23 16:37:57,185 INFO SenderThread:11966 [sender.py:finish():1572] shutting down sender
452
+ 2024-08-23 16:37:57,185 INFO SenderThread:11966 [file_pusher.py:finish():172] shutting down file pusher
453
+ 2024-08-23 16:37:57,185 INFO SenderThread:11966 [file_pusher.py:join():178] waiting for file pusher