koichi12 committed (verified)
Commit ca3e41a · 1 Parent(s): 09e5c81

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. wandb/run-20240802_173428-s75vpwte/files/config.yaml +335 -0
  2. wandb/run-20240802_173428-s75vpwte/files/output.log +0 -0
  3. wandb/run-20240802_173428-s75vpwte/files/requirements.txt +271 -0
  4. wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json +215 -0
  5. wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json +1 -0
  6. wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log +0 -0
  7. wandb/run-20240802_173428-s75vpwte/logs/debug.log +29 -0
  8. wandb/run-20240804_135607-ikp7tdz1/files/config.yaml +335 -0
  9. wandb/run-20240804_135607-ikp7tdz1/files/output.log +130 -0
  10. wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt +271 -0
  11. wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json +215 -0
  12. wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json +1 -0
  13. wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log +216 -0
  14. wandb/run-20240804_135607-ikp7tdz1/logs/debug.log +30 -0
  15. wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb +0 -0
  16. wandb/run-20240812_070449-ufge4h1y/files/config.yaml +335 -0
  17. wandb/run-20240812_070449-ufge4h1y/files/output.log +158 -0
  18. wandb/run-20240812_070449-ufge4h1y/files/requirements.txt +271 -0
  19. wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json +215 -0
  20. wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json +1 -0
  21. wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log +616 -0
  22. wandb/run-20240812_070449-ufge4h1y/logs/debug.log +29 -0
  23. wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb +0 -0
  24. wandb/run-20240812_073202-yby212na/files/config.yaml +335 -0
  25. wandb/run-20240812_073202-yby212na/files/output.log +116 -0
  26. wandb/run-20240812_073202-yby212na/files/requirements.txt +271 -0
  27. wandb/run-20240812_073202-yby212na/files/wandb-metadata.json +215 -0
  28. wandb/run-20240812_073202-yby212na/files/wandb-summary.json +1 -0
  29. wandb/run-20240812_073202-yby212na/logs/debug-internal.log +236 -0
  30. wandb/run-20240812_073202-yby212na/logs/debug.log +29 -0
  31. wandb/run-20240812_073202-yby212na/run-yby212na.wandb +0 -0
  32. wandb/run-20240815_041534-1ld4rgmy/files/config.yaml +337 -0
  33. wandb/run-20240815_041534-1ld4rgmy/files/output.log +92 -0
  34. wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt +354 -0
  35. wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json +215 -0
  36. wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json +1 -0
  37. wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log +162 -0
  38. wandb/run-20240815_041534-1ld4rgmy/logs/debug.log +29 -0
  39. wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb +0 -0
  40. wandb/run-20240824_202022-z2bjbf6e/files/config.yaml +321 -0
  41. wandb/run-20240824_202022-z2bjbf6e/files/output.log +51 -0
  42. wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt +375 -0
  43. wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json +880 -0
  44. wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json +1 -0
  45. wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log +191 -0
  46. wandb/run-20240824_202022-z2bjbf6e/logs/debug.log +28 -0
  47. wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb +0 -0
  48. wandb/run-20240826_221726-7jzdp89j/files/config.yaml +342 -0
  49. wandb/run-20240826_221726-7jzdp89j/files/output.log +0 -0
  50. wandb/run-20240826_221726-7jzdp89j/files/requirements.txt +375 -0
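The commit message above says these files were added with the upload-large-folder tool. For context, a minimal sketch of how such an upload is usually issued with a recent huggingface_hub release (the target repo id and local folder below are illustrative assumptions, not read from this commit):

    # Sketch only: assumes huggingface_hub >= 0.25, which ships upload_large_folder.
    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_large_folder(
        repo_id="koichi12/tiny-mistral-sample",  # illustrative target repo, not confirmed by the commit
        folder_path="wandb",                     # local folder to mirror; assumed here
        repo_type="model",
    )

The tool is built for large, resumable uploads, which matches the scale of this commit (more files than the 50 the web view renders).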
wandb/run-20240802_173428-s75vpwte/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-02-17:34:15
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722587668.341658
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 512
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
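The batch-size fields in the config above are mutually consistent: the global batch equals the micro batch times the gradient-accumulation steps times the data-parallel world size. A quick check in plain Python with the values copied from the config (the relation is the standard one; treating it as the exact rule this codebase uses is an assumption):

    # Values copied from the config.yaml diff above.
    micro_batch_size = 8
    gradient_accumulation_steps = 40
    world_size = 1

    # Standard relation for data-parallel training with gradient accumulation.
    effective_global_batch = micro_batch_size * gradient_accumulation_steps * world_size
    assert effective_global_batch == 320  # matches global_batch_size in the config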
wandb/run-20240802_173428-s75vpwte/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240802_173428-s75vpwte/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-02T08:34:28.941229",
5
+ "startedAt": "2024-08-02T08:34:28.326109",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample_train_2024-08-02-17:34:15"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0409999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.041,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.041,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.041,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.041,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.041,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.041,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.041,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.041,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.041,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.041,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.041,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.041,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.041,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.041,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.041,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.041,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.041,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.041,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 8.780712127685547, "training/perplexity": 6507.50970149773, "utils/batch_size": 8, "utils/global_batch_size": 320, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 1410, "optimizer/lr": 1.989808738231659e-05, "optimizer/variance_l2": 0.013855160145659429, "optimizer/variance_sqrt_l2": 0.9992841304001847, "optimizer/momentum_l2": 0.9839698623853019, "optimizer/weight_l2": 101.83051175850979, "optimizer/variance_l1": 1.002197265625, "optimizer/variance_sqrt_l1": 536.5, "optimizer/momentum_l1": 403.875, "optimizer/weight_l1": 332288.0, "optimizer/variance_abs_max": 0.0011444091796875, "optimizer/variance_sqrt_abs_max": 0.033935546875, "optimizer/momentum_abs_max": 0.03369140625, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 1.277997902000152, "stats/tokens_per_sec": 128450.91509389698, "stats/tokens_per_sec_per_gpu": 128450.91509389698, "stats/tflops": 9.093190310165799, "_timestamp": 1722589282.0763872, "_runtime": 1613.73472905159, "_step": 1410, "evaluation/val_loss": 8.783937454223633, "evaluation/val_ppl": 6528.5322265625, "_wandb": {"runtime": 1614}}
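The logged metrics in this summary hang together: the reported perplexity is exp(loss), and the token throughput follows from the global batch size, the sequence length, and the measured iteration time. A quick sanity check in plain Python with the numbers copied from the JSON above (the formulas are the usual definitions, assumed rather than taken from this codebase):

    import math

    # Copied from wandb-summary.json above.
    loss = 8.780712127685547
    print(math.exp(loss))  # ~6507.5, matches training/perplexity

    global_batch_size = 320
    seq_len = 513                   # utils/seq_len
    iteration_time = 1.277997902    # stats/1_iteration_time, in seconds
    tokens_per_sec = global_batch_size * seq_len / iteration_time
    print(tokens_per_sec)  # ~128450.9, matches stats/tokens_per_sec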
wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240802_173428-s75vpwte/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-02 17:34:28,332 INFO MainThread:13969 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Configure stats pid to 13969
3
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug.log
9
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log
10
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-02-17:34:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():616] starting backend
14
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-02 17:34:28,339 INFO MainThread:13969 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-02 17:34:28,341 INFO MainThread:13969 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-02 17:34:28,346 INFO MainThread:13969 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-02 17:34:28,360 INFO MainThread:13969 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-02 17:34:28,832 INFO MainThread:13969 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-02 17:34:28,977 INFO MainThread:13969 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240804_135607-ikp7tdz1/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 256
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama-sample_train_2024-08-04-13:55:35
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 2048
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722747367.911791
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_135607-ikp7tdz1/files/output.log ADDED
@@ -0,0 +1,130 @@
1
+ Created Hugging Face repository with ID koichi12/tiny-llama-sample.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ BFloat16 enabled for mixed precision - using bfSixteen policy
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 640000
21
+ validation: 35200
22
+ test: 3200
23
+ > building train, validation, and test datasets for GPT ...
24
+ > finished creating GPT datasets ...
25
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
26
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
27
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
28
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
29
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
30
+ model info: FullyShardedDataParallel(
31
+ (_fsdp_wrapped_module): LlamaForCausalLM(
32
+ (model): LlamaModel(
33
+ (embed_tokens): Embedding(32000, 2048)
34
+ (layers): ModuleList(
35
+ (0-21): 22 x FullyShardedDataParallel(
36
+ (_fsdp_wrapped_module): CheckpointWrapper(
37
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
38
+ (self_attn): LlamaFlashAttention2(
39
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
40
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
41
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
42
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
43
+ (rotary_emb): LlamaRotaryEmbedding()
44
+ )
45
+ (mlp): LlamaMLP(
46
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
47
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
48
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
49
+ (act_fn): SiLU()
50
+ )
51
+ (input_layernorm): LlamaRMSNorm()
52
+ (post_attention_layernorm): LlamaRMSNorm()
53
+ )
54
+ )
55
+ )
56
+ )
57
+ (norm): LlamaRMSNorm()
58
+ (rotary_emb): LlamaRotaryEmbedding()
59
+ )
60
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
61
+ )
62
+ )
63
+ model config: LlamaConfig {
64
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
65
+ "architectures": [
66
+ "LlamaForCausalLM"
67
+ ],
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "bos_token_id": 1,
71
+ "eos_token_id": 2,
72
+ "hidden_act": "silu",
73
+ "hidden_size": 2048,
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 5632,
76
+ "label_smoothing": 0.0,
77
+ "max_position_embeddings": 2048,
78
+ "mlp_bias": false,
79
+ "model_type": "llama",
80
+ "num_attention_heads": 32,
81
+ "num_hidden_layers": 22,
82
+ "num_key_value_heads": 4,
83
+ "pretraining_tp": 1,
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_scaling": null,
86
+ "rope_theta": 10000.0,
87
+ "tie_word_embeddings": false,
88
+ "torch_dtype": "float32",
89
+ "transformers_version": "4.43.3",
90
+ "use_cache": false,
91
+ "vocab_size": 32000
92
+ }
93
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
94
+ warnings.warn(
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
108
+ loss: torch.Tensor = model(**batch).loss
109
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
110
+ return self._call_impl(*args, **kwargs)
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
112
+ return forward_call(*args, **kwargs)
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
114
+ output = self._fsdp_wrapped_module(*args, **kwargs)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
116
+ return self._call_impl(*args, **kwargs)
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
118
+ return forward_call(*args, **kwargs)
119
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
120
+ outputs = self.model(
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
122
+ return self._call_impl(*args, **kwargs)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
124
+ return forward_call(*args, **kwargs)
125
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
126
+ cache_position = torch.arange(
127
+ RuntimeError: CUDA error: device-side assert triggered
128
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
129
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
130
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
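The run above dies with a device-side assert, and, as the log itself notes, the stack trace is reported asynchronously, so the cache_position line it points at is not necessarily where the assert fired. A hedged sketch of the re-run the log suggests, forcing synchronous CUDA error reporting (the variable must be set before any CUDA work happens, so the launching shell or the very top of the entry script are the usual places):

    # Sketch only: make CUDA errors surface at the failing kernel launch.
    # Must run before torch creates a CUDA context (i.e., before any .cuda()/.to("cuda") call).
    import os
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    # ...then launch training as before, e.g. examples/finetuning.py with the same arguments.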
wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T04:56:08.637907",
5
+ "startedAt": "2024-08-04T04:56:07.879507",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "256",
11
+ "--sliding-window-size",
12
+ "2048",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama-sample_train_2024-08-04-13:55:35"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
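The "program", "executable" and "args" fields in this metadata file together record the exact command line that launched the run. As a minimal illustrative sketch (not part of the original upload), such a wandb-metadata.json can be turned back into a runnable command with a few lines of Python; the path below is assumed to match this repository layout.

import json
import shlex

# Assumed path, matching the layout of this upload.
meta_path = "wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json"

with open(meta_path) as f:
    meta = json.load(f)

# Rebuild the launch command: interpreter + program + recorded args.
cmd = [meta["executable"], meta["program"], *meta["args"]]
print(shlex.join(cmd))
# -> /usr/bin/python /project/examples/finetuning.py --seq-length 256 ...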
wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 67}}
wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log ADDED
@@ -0,0 +1,216 @@
1
+ 2024-08-04 13:56:07,912 INFO StreamThr :9151 [internal.py:wandb_internal():86] W&B internal server running at pid: 9151, started at: 2024-08-04 13:56:07.911369
2
+ 2024-08-04 13:56:07,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 13:56:07,916 INFO WriterThread:9151 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
4
+ 2024-08-04 13:56:07,917 DEBUG SenderThread:9151 [sender.py:send():382] send: header
5
+ 2024-08-04 13:56:08,068 DEBUG SenderThread:9151 [sender.py:send():382] send: run
6
+ 2024-08-04 13:56:08,527 INFO SenderThread:9151 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_135607-ikp7tdz1/files
7
+ 2024-08-04 13:56:08,527 INFO SenderThread:9151 [sender.py:_start_run_threads():1136] run started: ikp7tdz1 with start time 1722747367.911791
8
+ 2024-08-04 13:56:08,532 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 13:56:08,533 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 13:56:08,619 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 13:56:08,625 INFO HandlerThread:9151 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 13:56:08,625 INFO SystemMonitor:9151 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 13:56:08,626 INFO HandlerThread:9151 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 13:56:08,626 INFO SystemMonitor:9151 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 13:56:08,627 INFO SystemMonitor:9151 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 13:56:08,629 INFO SystemMonitor:9151 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 13:56:08,637 DEBUG HandlerThread:9151 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 13:56:08,639 DEBUG HandlerThread:9151 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T04:56:08.637907', 'startedAt': '2024-08-04T04:56:07.879507', 'docker': None, 'cuda': None, 'args': ('--seq-length', '256', '--sliding-window-size', '2048', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-13:55:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 
42949672960}], 'memory': {'total': 56.48781967163086}}
26
+ 2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 13:56:08,653 INFO HandlerThread:9151 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 13:56:08,681 DEBUG SenderThread:9151 [sender.py:send():382] send: files
30
+ 2024-08-04 13:56:08,681 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 13:56:08,691 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 13:56:08,691 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 13:56:08,692 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 13:56:08,938 DEBUG SenderThread:9151 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 13:56:09,405 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /tmp/tmpins_li9awandb/mkgvo0s4-wandb-metadata.json
38
+ 2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
39
+ 2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json
40
+ 2024-08-04 13:56:10,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
41
+ 2024-08-04 13:56:12,531 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
42
+ 2024-08-04 13:56:13,586 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-04 13:56:16,533 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
44
+ 2024-08-04 13:56:19,567 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-04 13:56:23,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
46
+ 2024-08-04 13:56:23,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
47
+ 2024-08-04 13:56:23,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
48
+ 2024-08-04 13:56:24,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-04 13:56:29,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-04 13:56:34,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-04 13:56:38,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-04 13:56:38,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-04 13:56:38,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-04 13:56:39,955 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-04 13:56:40,547 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
56
+ 2024-08-04 13:56:45,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-04 13:56:50,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-04 13:56:53,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-04 13:56:53,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-04 13:56:53,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-04 13:56:55,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-04 13:57:00,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-04 13:57:05,958 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-04 13:57:08,629 DEBUG SystemMonitor:9151 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-04 13:57:08,630 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
66
+ 2024-08-04 13:57:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-04 13:57:08,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-04 13:57:08,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-04 13:57:11,872 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-04 13:57:15,196 DEBUG SenderThread:9151 [sender.py:send():382] send: config
71
+ 2024-08-04 13:57:15,197 DEBUG SenderThread:9151 [sender.py:send():382] send: config
72
+ 2024-08-04 13:57:16,571 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
73
+ 2024-08-04 13:57:16,600 DEBUG SenderThread:9151 [sender.py:send():382] send: exit
74
+ 2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():589] handling exit code: 1
75
+ 2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():591] handling runtime: 67
76
+ 2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
77
+ 2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:send_exit():597] send defer
78
+ 2024-08-04 13:57:16,602 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 0
80
+ 2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 0
82
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 1
83
+ 2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 1
85
+ 2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
86
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 1
87
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 2
88
+ 2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 2
90
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [system_monitor.py:finish():203] Stopping system monitor
91
+ 2024-08-04 13:57:16,603 DEBUG SystemMonitor:9151 [system_monitor.py:_start():179] Finished system metrics aggregation loop
92
+ 2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined cpu monitor
93
+ 2024-08-04 13:57:16,604 DEBUG SystemMonitor:9151 [system_monitor.py:_start():183] Publishing last batch of metrics
94
+ 2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined disk monitor
95
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined gpu monitor
96
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined memory monitor
97
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined network monitor
98
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 2
100
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 3
101
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
102
+ 2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 3
104
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 3
106
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 4
107
+ 2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
108
+ 2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 4
109
+ 2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
110
+ 2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 4
111
+ 2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 5
112
+ 2024-08-04 13:57:16,639 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-04 13:57:16,639 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 5
114
+ 2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send():382] send: summary
115
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
116
+ 2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
117
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 5
118
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 6
119
+ 2024-08-04 13:57:16,640 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-04 13:57:16,640 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 6
121
+ 2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
122
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 6
123
+ 2024-08-04 13:57:16,643 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
124
+ 2024-08-04 13:57:16,835 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 7
125
+ 2024-08-04 13:57:16,836 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-04 13:57:16,836 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 7
127
+ 2024-08-04 13:57:16,836 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
128
+ 2024-08-04 13:57:16,836 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 7
129
+ 2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
130
+ 2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
131
+ 2024-08-04 13:57:17,600 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
132
+ 2024-08-04 13:57:18,334 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 8
133
+ 2024-08-04 13:57:18,334 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
134
+ 2024-08-04 13:57:18,334 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
135
+ 2024-08-04 13:57:18,335 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 8
136
+ 2024-08-04 13:57:18,335 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
137
+ 2024-08-04 13:57:18,335 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 8
138
+ 2024-08-04 13:57:18,335 INFO SenderThread:9151 [job_builder.py:build():296] Attempting to build job artifact
139
+ 2024-08-04 13:57:18,336 INFO SenderThread:9151 [job_builder.py:_get_source_type():426] is repo sourced job
140
+ 2024-08-04 13:57:18,350 INFO SenderThread:9151 [job_builder.py:build():402] adding wandb-job metadata file
141
+ 2024-08-04 13:57:18,359 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 9
142
+ 2024-08-04 13:57:18,360 DEBUG SenderThread:9151 [sender.py:send():382] send: artifact
143
+ 2024-08-04 13:57:18,360 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 13:57:18,361 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 9
145
+ 2024-08-04 13:57:18,573 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
146
+ 2024-08-04 13:57:18,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
147
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
148
+ 2024-08-04 13:57:19,234 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 9
150
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [dir_watcher.py:finish():358] shutting down directory watcher
151
+ 2024-08-04 13:57:19,573 INFO SenderThread:9151 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_135607-ikp7tdz1/files
152
+ 2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt requirements.txt
153
+ 2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml config.yaml
154
+ 2024-08-04 13:57:19,575 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json wandb-metadata.json
155
+ 2024-08-04 13:57:19,576 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json wandb-summary.json
156
+ 2024-08-04 13:57:19,577 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log output.log
157
+ 2024-08-04 13:57:19,579 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 10
158
+ 2024-08-04 13:57:19,579 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
159
+ 2024-08-04 13:57:19,579 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
160
+ 2024-08-04 13:57:19,580 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 10
161
+ 2024-08-04 13:57:19,581 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
162
+ 2024-08-04 13:57:19,581 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 10
163
+ 2024-08-04 13:57:19,581 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
164
+ 2024-08-04 13:57:19,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
165
+ 2024-08-04 13:57:19,601 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
166
+ 2024-08-04 13:57:19,983 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
167
+ 2024-08-04 13:57:20,084 INFO wandb-upload_1:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
168
+ 2024-08-04 13:57:20,165 INFO wandb-upload_2:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
169
+ 2024-08-04 13:57:20,334 INFO wandb-upload_3:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
170
+ 2024-08-04 13:57:20,534 INFO Thread-11 (_thread_body):9151 [sender.py:transition_state():617] send defer: 11
171
+ 2024-08-04 13:57:20,534 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
172
+ 2024-08-04 13:57:20,534 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 11
173
+ 2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
174
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 11
175
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
176
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 12
177
+ 2024-08-04 13:57:20,535 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
178
+ 2024-08-04 13:57:20,535 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 12
179
+ 2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
180
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 12
181
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_stream.py:finish():595] file stream finish called
182
+ 2024-08-04 13:57:20,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
183
+ 2024-08-04 13:57:20,717 INFO SenderThread:9151 [file_stream.py:finish():599] file stream finish is done
184
+ 2024-08-04 13:57:20,717 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 13
185
+ 2024-08-04 13:57:20,717 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
186
+ 2024-08-04 13:57:20,717 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 13
188
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
189
+ 2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 13
190
+ 2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 14
191
+ 2024-08-04 13:57:20,718 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
192
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: final
193
+ 2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 14
194
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: footer
195
+ 2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
196
+ 2024-08-04 13:57:20,719 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 14
197
+ 2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
198
+ 2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
199
+ 2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
200
+ 2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: server_info
201
+ 2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
202
+ 2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: get_summary
203
+ 2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: server_info
204
+ 2024-08-04 13:57:20,721 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: sampled_history
205
+ 2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
206
+ 2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: job_info
207
+ 2024-08-04 13:57:20,885 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: job_info
208
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3866] rendering history
209
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
210
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_sync_info():3825] logging synced files
211
+ 2024-08-04 13:57:20,886 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: shutdown
212
+ 2024-08-04 13:57:20,886 INFO HandlerThread:9151 [handler.py:finish():869] shutting down handler
213
+ 2024-08-04 13:57:21,722 INFO WriterThread:9151 [datastore.py:close():296] close: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
214
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [sender.py:finish():1572] shutting down sender
215
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
216
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_135607-ikp7tdz1/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Configure stats pid to 9079
3
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug.log
9
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log
10
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 256, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-13:55:35', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 2048, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 13:56:07,909 INFO MainThread:9079 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 13:56:07,911 INFO MainThread:9079 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 13:56:07,916 INFO MainThread:9079 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 13:56:08,064 INFO MainThread:9079 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 13:56:08,532 INFO MainThread:9079 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 13:56:08,691 INFO MainThread:9079 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 13:57:15,195 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 13:57:15,196 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 13:57:21,887 WARNING MsgRouterThr:9079 [router.py:message_loop():77] message_loop has been closed
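One consistency check on the config dict logged above, assuming the usual relation between these fields (the log itself does not state the formula): gradient_accumulation_steps should equal global_batch_size divided by micro_batch_size times world_size. For this run that is 320 / (8 * 1) = 40, which matches the logged gradient_accumulation_steps.

# Assumed relation, not quoted from the training code:
# gradient_accumulation_steps = global_batch_size / (micro_batch_size * world_size)
global_batch_size = 320  # from the config dict logged above
micro_batch_size = 8
world_size = 1

grad_accum = global_batch_size // (micro_batch_size * world_size)
print(grad_accum)  # 40, matching the logged value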
wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb ADDED
Binary file (22.5 kB). View file
 
wandb/run-20240812_070449-ufge4h1y/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-07:04:37
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 5
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723413889.11596
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
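Each key in the config.yaml above is wrapped in a small mapping with desc and value fields (wandb's export format). A minimal sketch for flattening such a file back into a plain dict, assuming PyYAML is installed and the path below matches this upload:

import yaml

# Assumed path, matching the layout of this upload.
cfg_path = "wandb/run-20240812_070449-ufge4h1y/files/config.yaml"

with open(cfg_path) as f:
    raw = yaml.safe_load(f)

# Keep only the {'desc': ..., 'value': ...} entries and unwrap them,
# skipping wandb bookkeeping such as wandb_version and _wandb.
config = {
    key: entry["value"]
    for key, entry in raw.items()
    if isinstance(entry, dict) and "value" in entry and key != "_wandb"
}

print(config["global_batch_size"], config["micro_batch_size"])  # 320 1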
wandb/run-20240812_070449-ufge4h1y/files/output.log ADDED
@@ -0,0 +1,158 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
10
+ warnings.warn(
11
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
14
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
15
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
16
+ BFloat16 enabled for mixed precision - using bfSixteen policy
17
+ Let split = None
18
+ Building a BlendedDataset for a single MegatronDataset
19
+ Unable to save the indexes because path_to_cache is None
20
+ Building a BlendedDataset for a single MegatronDataset
21
+ Unable to save the indexes because path_to_cache is None
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ --> applying fsdp activation checkpointing...
25
+ > datasets target sizes (minimum size):
26
+ train: 6400000
27
+ validation: 323200
28
+ test: 3200
29
+ > building train, validation, and test datasets for GPT ...
30
+ > finished creating GPT datasets ...
31
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
32
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
33
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
34
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
35
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
36
+ model info: FullyShardedDataParallel(
37
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
38
+ (model): Qwen2Model(
39
+ (embed_tokens): Embedding(151936, 896)
40
+ (layers): ModuleList(
41
+ (0-23): 24 x FullyShardedDataParallel(
42
+ (_fsdp_wrapped_module): CheckpointWrapper(
43
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
44
+ (self_attn): Qwen2FlashAttention2(
45
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
46
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
47
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
48
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
49
+ (rotary_emb): Qwen2RotaryEmbedding()
50
+ )
51
+ (mlp): Qwen2MLP(
52
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
53
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
54
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
55
+ (act_fn): SiLU()
56
+ )
57
+ (input_layernorm): Qwen2RMSNorm()
58
+ (post_attention_layernorm): Qwen2RMSNorm()
59
+ )
60
+ )
61
+ )
62
+ )
63
+ (norm): Qwen2RMSNorm()
64
+ )
65
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
66
+ )
67
+ )
68
+ model config: Qwen2Config {
69
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
70
+ "architectures": [
71
+ "Qwen2ForCausalLM"
72
+ ],
73
+ "attention_dropout": 0.0,
74
+ "bos_token_id": 151643,
75
+ "eos_token_id": 151643,
76
+ "hidden_act": "silu",
77
+ "hidden_size": 896,
78
+ "initializer_range": 0.02,
79
+ "intermediate_size": 4864,
80
+ "label_smoothing": 0.0,
81
+ "max_position_embeddings": 4096,
82
+ "max_window_layers": 24,
83
+ "model_type": "qwen2",
84
+ "num_attention_heads": 14,
85
+ "num_hidden_layers": 24,
86
+ "num_key_value_heads": 2,
87
+ "rms_norm_eps": 1e-06,
88
+ "rope_theta": 1000000.0,
89
+ "sliding_window": null,
90
+ "tie_word_embeddings": true,
91
+ "torch_dtype": "bfloat16",
92
+ "transformers_version": "4.43.3",
93
+ "use_cache": false,
94
+ "use_sliding_window": false,
95
+ "vocab_size": 151936
96
+ }
97
+ ------------------------------------------------------------------
98
+ iteration: 1 , TFLOPS: 69.43623917184445, Tokens per sec: 17268.44384112612, Loss: 4.1814446449279785
99
+ ------------------------------------------------------------------
100
+ ------------------------------------------------------------------
101
+ iteration: 2 , TFLOPS: 69.64205785663373, Tokens per sec: 17319.629914020166, Loss: 4.191491603851318
102
+ ------------------------------------------------------------------
103
+ ------------------------------------------------------------------
104
+ iteration: 3 , TFLOPS: 69.60094665048808, Tokens per sec: 17309.405763590446, Loss: 4.197597026824951
105
+ ------------------------------------------------------------------
106
+ ------------------------------------------------------------------
107
+ iteration: 4 , TFLOPS: 69.47512522949748, Tokens per sec: 17278.114608304662, Loss: 4.183670520782471
108
+ ------------------------------------------------------------------
109
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
110
+ warnings.warn(
111
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
112
+ warnings.warn(
113
+ ------------------------------------------------------------------
114
+ iteration: 5 , TFLOPS: 69.67467547447801, Tokens per sec: 17327.7417517103, Loss: 4.198245048522949
115
+ ------------------------------------------------------------------
116
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
117
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
118
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
119
+ [rank0]:[2024-08-12 07:11:16,345] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006517466999866883, 'preprocessing_with_comm': 0.0007555539996246807, 'state_converting': 0.9849483990001318, <Type.ALL: 'all'>: 0.9936859660001574})
120
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
121
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
122
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
123
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
124
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
125
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
126
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005, took 4.44s
127
+ ------------------------------------------------------------------
128
+ iteration: 6 , TFLOPS: 70.22008480550622, Tokens per sec: 17463.382312253587, Loss: 4.179391860961914
129
+ ------------------------------------------------------------------
130
+ ------------------------------------------------------------------
131
+ iteration: 7 , TFLOPS: 69.98955682269778, Tokens per sec: 17406.051161079293, Loss: 4.190949440002441
132
+ ------------------------------------------------------------------
133
+ ------------------------------------------------------------------
134
+ iteration: 8 , TFLOPS: 69.94509258955091, Tokens per sec: 17394.993129679646, Loss: 4.189082622528076
135
+ ------------------------------------------------------------------
136
+ ------------------------------------------------------------------
137
+ iteration: 9 , TFLOPS: 70.07602036768274, Tokens per sec: 17427.55421033261, Loss: 4.181089878082275
138
+ ------------------------------------------------------------------
139
+ ------------------------------------------------------------------
140
+ iteration: 10 , TFLOPS: 70.03395601975187, Tokens per sec: 17417.093018329397, Loss: 4.1603803634643555
141
+ ------------------------------------------------------------------
142
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010
143
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
144
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
145
+ [rank0]:[2024-08-12 07:17:37,283] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0064329239994549425, 'preprocessing_with_comm': 0.0007190309997895383, 'state_converting': 0.9757228209991808, <Type.ALL: 'all'>: 0.9842789310005173})
146
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
147
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
148
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
149
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
150
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
151
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
152
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010, took 4.48s
153
+ ------------------------------------------------------------------
154
+ iteration: 11 , TFLOPS: 70.31766010694388, Tokens per sec: 17487.64879951231, Loss: 4.118324279785156
155
+ ------------------------------------------------------------------
156
+ ------------------------------------------------------------------
157
+ iteration: 12 , TFLOPS: 70.37958976318761, Tokens per sec: 17503.050393891557, Loss: 4.171144008636475
158
+ ------------------------------------------------------------------
wandb/run-20240812_070449-ufge4h1y/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T22:04:49.754332",
5
+ "startedAt": "2024-08-11T22:04:49.102690",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "5",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-07:04:37"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 4.171144008636475, "training/perplexity": 64.78952950804121, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 12, "optimizer/lr": 1.4560000000000001e-06, "optimizer/variance_l2": 0.012989128226478895, "optimizer/variance_sqrt_l2": 0.6784465027663834, "optimizer/momentum_l2": 0.7107880089338467, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.4604034423828125, "optimizer/variance_sqrt_l1": 2849.0, "optimizer/momentum_l1": 2785.25, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00909423828125, "optimizer/variance_sqrt_abs_max": 0.09521484375, "optimizer/momentum_abs_max": 0.10107421875, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.9035151299995, "stats/tokens_per_sec": 17503.050393891557, "stats/tokens_per_sec_per_gpu": 17503.050393891557, "stats/tflops": 70.37958976318761, "_timestamp": 1723414808.909133, "_runtime": 919.7931730747223, "_step": 12, "_wandb": {"runtime": 922}}
wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log ADDED
@@ -0,0 +1,616 @@
1
+ 2024-08-12 07:04:49,117 INFO StreamThr :13762 [internal.py:wandb_internal():86] W&B internal server running at pid: 13762, started at: 2024-08-12 07:04:49.116639
2
+ 2024-08-12 07:04:49,119 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 07:04:49,121 INFO WriterThread:13762 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
4
+ 2024-08-12 07:04:49,122 DEBUG SenderThread:13762 [sender.py:send():382] send: header
5
+ 2024-08-12 07:04:49,136 DEBUG SenderThread:13762 [sender.py:send():382] send: run
6
+ 2024-08-12 07:04:49,638 INFO SenderThread:13762 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_070449-ufge4h1y/files
7
+ 2024-08-12 07:04:49,638 INFO SenderThread:13762 [sender.py:_start_run_threads():1136] run started: ufge4h1y with start time 1723413889.11596
8
+ 2024-08-12 07:04:49,643 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 07:04:49,643 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 07:04:49,733 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 07:04:49,742 INFO SystemMonitor:13762 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 07:04:49,744 INFO SystemMonitor:13762 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 07:04:49,754 DEBUG HandlerThread:13762 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 07:04:49,756 DEBUG HandlerThread:13762 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 07:04:49,770 DEBUG HandlerThread:13762 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:04:49.754332', 'startedAt': '2024-08-11T22:04:49.102690', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:04:37'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 07:04:49,772 INFO HandlerThread:13762 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 07:04:49,779 DEBUG SenderThread:13762 [sender.py:send():382] send: files
30
+ 2024-08-12 07:04:49,779 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 07:04:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-12 07:04:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-12 07:04:49,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 07:04:50,088 DEBUG SenderThread:13762 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 07:04:50,465 INFO wandb-upload_0:13762 [upload_job.py:push():131] Uploaded file /tmp/tmp0h3j51sdwandb/z7nk28zc-wandb-metadata.json
38
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json
39
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
40
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt
41
+ 2024-08-12 07:04:52,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
42
+ 2024-08-12 07:04:54,468 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 07:04:54,641 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
44
+ 2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
45
+ 2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
46
+ 2024-08-12 07:04:56,643 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
47
+ 2024-08-12 07:04:59,720 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
48
+ 2024-08-12 07:05:04,721 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-12 07:05:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
50
+ 2024-08-12 07:05:04,790 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
51
+ 2024-08-12 07:05:04,790 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-12 07:05:10,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-12 07:05:15,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-12 07:05:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
55
+ 2024-08-12 07:05:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
56
+ 2024-08-12 07:05:19,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
57
+ 2024-08-12 07:05:20,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 07:05:20,658 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml
59
+ 2024-08-12 07:05:25,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
60
+ 2024-08-12 07:05:30,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-12 07:05:34,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
62
+ 2024-08-12 07:05:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
63
+ 2024-08-12 07:05:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
64
+ 2024-08-12 07:05:36,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
65
+ 2024-08-12 07:05:41,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-12 07:05:46,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-12 07:05:49,744 DEBUG SystemMonitor:13762 [system_monitor.py:_start():172] Starting system metrics aggregation loop
68
+ 2024-08-12 07:05:49,746 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
69
+ 2024-08-12 07:05:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
70
+ 2024-08-12 07:05:49,788 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
71
+ 2024-08-12 07:05:49,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
72
+ 2024-08-12 07:05:51,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-12 07:05:56,987 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-12 07:06:01,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-12 07:06:04,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
76
+ 2024-08-12 07:06:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
77
+ 2024-08-12 07:06:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
78
+ 2024-08-12 07:06:06,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
79
+ 2024-08-12 07:06:10,837 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
80
+ 2024-08-12 07:06:12,691 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
81
+ 2024-08-12 07:06:12,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 07:06:17,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-12 07:06:19,747 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
84
+ 2024-08-12 07:06:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-12 07:06:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
86
+ 2024-08-12 07:06:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
87
+ 2024-08-12 07:06:23,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-12 07:06:28,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 07:06:33,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 07:06:34,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-12 07:06:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-12 07:06:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-12 07:06:39,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 07:06:44,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 07:06:49,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 07:06:49,748 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
97
+ 2024-08-12 07:06:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 07:06:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 07:06:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 07:06:54,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 07:06:59,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 07:07:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
103
+ 2024-08-12 07:07:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
104
+ 2024-08-12 07:07:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
105
+ 2024-08-12 07:07:05,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
106
+ 2024-08-12 07:07:10,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-12 07:07:15,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 07:07:19,749 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
109
+ 2024-08-12 07:07:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
110
+ 2024-08-12 07:07:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
111
+ 2024-08-12 07:07:19,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
112
+ 2024-08-12 07:07:20,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-12 07:07:25,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-12 07:07:26,535 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
115
+ 2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send():382] send: history
116
+ 2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
117
+ 2024-08-12 07:07:26,540 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
118
+ 2024-08-12 07:07:26,739 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
119
+ 2024-08-12 07:07:28,741 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
120
+ 2024-08-12 07:07:31,578 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
122
+ 2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
123
+ 2024-08-12 07:07:34,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
124
+ 2024-08-12 07:07:37,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-12 07:07:42,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-12 07:07:47,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
127
+ 2024-08-12 07:07:49,750 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
128
+ 2024-08-12 07:07:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
129
+ 2024-08-12 07:07:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
130
+ 2024-08-12 07:07:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
131
+ 2024-08-12 07:07:52,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-08-12 07:07:57,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
133
+ 2024-08-12 07:08:02,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
134
+ 2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
135
+ 2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
136
+ 2024-08-12 07:08:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
137
+ 2024-08-12 07:08:08,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
138
+ 2024-08-12 07:08:13,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
139
+ 2024-08-12 07:08:18,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-08-12 07:08:19,751 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
141
+ 2024-08-12 07:08:19,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
142
+ 2024-08-12 07:08:19,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
143
+ 2024-08-12 07:08:19,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
144
+ 2024-08-12 07:08:23,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
145
+ 2024-08-12 07:08:28,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
146
+ 2024-08-12 07:08:33,991 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-12 07:08:34,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
148
+ 2024-08-12 07:08:34,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
149
+ 2024-08-12 07:08:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
150
+ 2024-08-12 07:08:39,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
151
+ 2024-08-12 07:08:42,279 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
152
+ 2024-08-12 07:08:42,281 DEBUG SenderThread:13762 [sender.py:send():382] send: history
153
+ 2024-08-12 07:08:42,282 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
154
+ 2024-08-12 07:08:42,283 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
155
+ 2024-08-12 07:08:42,792 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
156
+ 2024-08-12 07:08:44,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
157
+ 2024-08-12 07:08:44,793 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
158
+ 2024-08-12 07:08:49,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
159
+ 2024-08-12 07:08:49,752 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
160
+ 2024-08-12 07:08:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
161
+ 2024-08-12 07:08:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
162
+ 2024-08-12 07:08:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
163
+ 2024-08-12 07:08:54,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
164
+ 2024-08-12 07:08:59,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-12 07:09:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
166
+ 2024-08-12 07:09:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
167
+ 2024-08-12 07:09:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
168
+ 2024-08-12 07:09:05,001 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
169
+ 2024-08-12 07:09:10,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
170
+ 2024-08-12 07:09:15,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-12 07:09:19,753 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
172
+ 2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
173
+ 2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
174
+ 2024-08-12 07:09:19,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
175
+ 2024-08-12 07:09:20,044 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
176
+ 2024-08-12 07:09:25,045 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-08-12 07:09:30,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
179
+ 2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
180
+ 2024-08-12 07:09:34,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
181
+ 2024-08-12 07:09:35,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
182
+ 2024-08-12 07:09:40,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
183
+ 2024-08-12 07:09:45,996 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-08-12 07:09:49,754 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
185
+ 2024-08-12 07:09:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
186
+ 2024-08-12 07:09:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
187
+ 2024-08-12 07:09:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
188
+ 2024-08-12 07:09:51,979 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
189
+ 2024-08-12 07:09:56,980 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
190
+ 2024-08-12 07:09:58,160 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
191
+ 2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send():382] send: history
192
+ 2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
193
+ 2024-08-12 07:09:58,163 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
194
+ 2024-08-12 07:09:58,845 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
195
+ 2024-08-12 07:10:00,846 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
196
+ 2024-08-12 07:10:02,202 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
197
+ 2024-08-12 07:10:04,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
198
+ 2024-08-12 07:10:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
199
+ 2024-08-12 07:10:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
200
+ 2024-08-12 07:10:08,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
201
+ 2024-08-12 07:10:13,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
202
+ 2024-08-12 07:10:18,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
203
+ 2024-08-12 07:10:19,755 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
204
+ 2024-08-12 07:10:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
205
+ 2024-08-12 07:10:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
206
+ 2024-08-12 07:10:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
207
+ 2024-08-12 07:10:23,070 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
208
+ 2024-08-12 07:10:28,071 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-12 07:10:33,072 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-12 07:10:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
211
+ 2024-08-12 07:10:34,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
212
+ 2024-08-12 07:10:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
213
+ 2024-08-12 07:10:38,976 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
214
+ 2024-08-12 07:10:43,977 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-12 07:10:48,978 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
216
+ 2024-08-12 07:10:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
217
+ 2024-08-12 07:10:49,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
218
+ 2024-08-12 07:10:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
219
+ 2024-08-12 07:10:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
220
+ 2024-08-12 07:10:54,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
221
+ 2024-08-12 07:10:59,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
222
+ 2024-08-12 07:11:04,012 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-12 07:11:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
224
+ 2024-08-12 07:11:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
225
+ 2024-08-12 07:11:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
226
+ 2024-08-12 07:11:09,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-12 07:11:13,824 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
228
+ 2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send():382] send: history
229
+ 2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
230
+ 2024-08-12 07:11:13,827 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
231
+ 2024-08-12 07:11:13,896 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
232
+ 2024-08-12 07:11:14,866 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
233
+ 2024-08-12 07:11:14,897 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
234
+ 2024-08-12 07:11:16,898 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
235
+ 2024-08-12 07:11:18,900 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
236
+ 2024-08-12 07:11:19,757 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
237
+ 2024-08-12 07:11:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
238
+ 2024-08-12 07:11:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
239
+ 2024-08-12 07:11:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
240
+ 2024-08-12 07:11:20,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-08-12 07:11:20,901 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
242
+ 2024-08-12 07:11:25,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
243
+ 2024-08-12 07:11:30,005 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
244
+ 2024-08-12 07:11:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
245
+ 2024-08-12 07:11:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
246
+ 2024-08-12 07:11:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
247
+ 2024-08-12 07:11:35,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
248
+ 2024-08-12 07:11:40,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
249
+ 2024-08-12 07:11:45,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
250
+ 2024-08-12 07:11:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
251
+ 2024-08-12 07:11:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
252
+ 2024-08-12 07:11:49,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
253
+ 2024-08-12 07:11:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
254
+ 2024-08-12 07:11:51,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
255
+ 2024-08-12 07:11:56,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
256
+ 2024-08-12 07:12:01,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
257
+ 2024-08-12 07:12:04,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
258
+ 2024-08-12 07:12:04,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
259
+ 2024-08-12 07:12:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
260
+ 2024-08-12 07:12:06,998 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
261
+ 2024-08-12 07:12:11,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
262
+ 2024-08-12 07:12:17,000 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
263
+ 2024-08-12 07:12:19,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
264
+ 2024-08-12 07:12:19,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
265
+ 2024-08-12 07:12:19,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
266
+ 2024-08-12 07:12:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
267
+ 2024-08-12 07:12:22,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
268
+ 2024-08-12 07:12:27,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
269
+ 2024-08-12 07:12:32,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
270
+ 2024-08-12 07:12:33,344 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
271
+ 2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send():382] send: history
272
+ 2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
273
+ 2024-08-12 07:12:33,348 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
274
+ 2024-08-12 07:12:33,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
275
+ 2024-08-12 07:12:34,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
276
+ 2024-08-12 07:12:34,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
277
+ 2024-08-12 07:12:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
278
+ 2024-08-12 07:12:34,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
279
+ 2024-08-12 07:12:38,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
280
+ 2024-08-12 07:12:43,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
281
+ 2024-08-12 07:12:48,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
282
+ 2024-08-12 07:12:49,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
283
+ 2024-08-12 07:12:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
284
+ 2024-08-12 07:12:49,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
285
+ 2024-08-12 07:12:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
286
+ 2024-08-12 07:12:53,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
287
+ 2024-08-12 07:12:58,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
288
+ 2024-08-12 07:13:03,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
289
+ 2024-08-12 07:13:04,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
290
+ 2024-08-12 07:13:04,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
291
+ 2024-08-12 07:13:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
292
+ 2024-08-12 07:13:09,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
293
+ 2024-08-12 07:13:14,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
294
+ 2024-08-12 07:13:19,034 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
295
+ 2024-08-12 07:13:19,761 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
296
+ 2024-08-12 07:13:19,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
297
+ 2024-08-12 07:13:19,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
298
+ 2024-08-12 07:13:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
299
+ 2024-08-12 07:13:25,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
300
+ 2024-08-12 07:13:30,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
301
+ 2024-08-12 07:13:34,838 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
302
+ 2024-08-12 07:13:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
303
+ 2024-08-12 07:13:34,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
304
+ 2024-08-12 07:13:35,055 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
305
+ 2024-08-12 07:13:40,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
306
+ 2024-08-12 07:13:45,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
307
+ 2024-08-12 07:13:48,668 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
308
+ 2024-08-12 07:13:48,670 DEBUG SenderThread:13762 [sender.py:send():382] send: history
309
+ 2024-08-12 07:13:48,671 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
310
+ 2024-08-12 07:13:48,672 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
311
+ 2024-08-12 07:13:48,997 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
312
+ 2024-08-12 07:13:49,762 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
313
+ 2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
314
+ 2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
315
+ 2024-08-12 07:13:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
316
+ 2024-08-12 07:13:50,998 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
317
+ 2024-08-12 07:13:51,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
318
+ 2024-08-12 07:13:56,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
319
+ 2024-08-12 07:14:01,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
320
+ 2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
321
+ 2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
322
+ 2024-08-12 07:14:04,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
323
+ 2024-08-12 07:14:06,089 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
324
+ 2024-08-12 07:14:11,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
325
+ 2024-08-12 07:14:16,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
326
+ 2024-08-12 07:14:19,763 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
327
+ 2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
328
+ 2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
329
+ 2024-08-12 07:14:19,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
330
+ 2024-08-12 07:14:21,108 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
331
+ 2024-08-12 07:14:26,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
332
+ 2024-08-12 07:14:31,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
333
+ 2024-08-12 07:14:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
334
+ 2024-08-12 07:14:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
335
+ 2024-08-12 07:14:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
336
+ 2024-08-12 07:14:37,031 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
337
+ 2024-08-12 07:14:42,032 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
338
+ 2024-08-12 07:14:47,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
339
+ 2024-08-12 07:14:49,764 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
340
+ 2024-08-12 07:14:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
341
+ 2024-08-12 07:14:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
342
+ 2024-08-12 07:14:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
343
+ 2024-08-12 07:14:52,060 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
344
+ 2024-08-12 07:14:57,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
345
+ 2024-08-12 07:15:02,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
346
+ 2024-08-12 07:15:04,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
347
+ 2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send():382] send: history
348
+ 2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
349
+ 2024-08-12 07:15:04,043 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
350
+ 2024-08-12 07:15:04,047 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
351
+ 2024-08-12 07:15:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
352
+ 2024-08-12 07:15:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
353
+ 2024-08-12 07:15:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
354
+ 2024-08-12 07:15:05,048 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
355
+ 2024-08-12 07:15:07,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
356
+ 2024-08-12 07:15:12,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
357
+ 2024-08-12 07:15:17,078 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
358
+ 2024-08-12 07:15:19,765 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
359
+ 2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
360
+ 2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
361
+ 2024-08-12 07:15:19,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
362
+ 2024-08-12 07:15:22,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
363
+ 2024-08-12 07:15:27,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
364
+ 2024-08-12 07:15:32,082 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
365
+ 2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
366
+ 2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
367
+ 2024-08-12 07:15:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
368
+ 2024-08-12 07:15:38,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
369
+ 2024-08-12 07:15:43,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
370
+ 2024-08-12 07:15:48,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
371
+ 2024-08-12 07:15:49,766 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
372
+ 2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
373
+ 2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
374
+ 2024-08-12 07:15:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
375
+ 2024-08-12 07:15:53,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
376
+ 2024-08-12 07:15:58,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
377
+ 2024-08-12 07:16:03,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
378
+ 2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
379
+ 2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
380
+ 2024-08-12 07:16:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
381
+ 2024-08-12 07:16:09,051 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
382
+ 2024-08-12 07:16:14,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
383
+ 2024-08-12 07:16:19,053 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
384
+ 2024-08-12 07:16:19,269 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
385
+ 2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send():382] send: history
386
+ 2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
387
+ 2024-08-12 07:16:19,273 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
388
+ 2024-08-12 07:16:19,767 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
389
+ 2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
390
+ 2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
391
+ 2024-08-12 07:16:19,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
392
+ 2024-08-12 07:16:20,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
393
+ 2024-08-12 07:16:21,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
394
+ 2024-08-12 07:16:25,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
395
+ 2024-08-12 07:16:30,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
396
+ 2024-08-12 07:16:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
397
+ 2024-08-12 07:16:34,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
398
+ 2024-08-12 07:16:34,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
399
+ 2024-08-12 07:16:35,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
400
+ 2024-08-12 07:16:40,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
401
+ 2024-08-12 07:16:45,101 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
402
+ 2024-08-12 07:16:49,768 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
403
+ 2024-08-12 07:16:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
404
+ 2024-08-12 07:16:49,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
405
+ 2024-08-12 07:16:49,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
406
+ 2024-08-12 07:16:51,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
407
+ 2024-08-12 07:16:56,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
408
+ 2024-08-12 07:17:01,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
409
+ 2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
410
+ 2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
411
+ 2024-08-12 07:17:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
412
+ 2024-08-12 07:17:06,122 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
413
+ 2024-08-12 07:17:11,123 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
414
+ 2024-08-12 07:17:16,124 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
415
+ 2024-08-12 07:17:19,769 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
416
+ 2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
417
+ 2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
418
+ 2024-08-12 07:17:19,842 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
419
+ 2024-08-12 07:17:22,020 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
420
+ 2024-08-12 07:17:27,021 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
421
+ 2024-08-12 07:17:32,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
422
+ 2024-08-12 07:17:34,545 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
423
+ 2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send():382] send: history
424
+ 2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
425
+ 2024-08-12 07:17:34,550 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
426
+ 2024-08-12 07:17:35,013 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
427
+ 2024-08-12 07:17:35,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
428
+ 2024-08-12 07:17:35,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
429
+ 2024-08-12 07:17:35,149 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
430
+ 2024-08-12 07:17:37,151 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
431
+ 2024-08-12 07:17:37,272 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
432
+ 2024-08-12 07:17:39,152 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
433
+ 2024-08-12 07:17:41,154 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
434
+ 2024-08-12 07:17:43,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
435
+ 2024-08-12 07:17:48,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
436
+ 2024-08-12 07:17:49,770 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
437
+ 2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
438
+ 2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
439
+ 2024-08-12 07:17:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
440
+ 2024-08-12 07:17:53,197 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
441
+ 2024-08-12 07:17:58,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
442
+ 2024-08-12 07:18:03,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
443
+ 2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
444
+ 2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
445
+ 2024-08-12 07:18:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
446
+ 2024-08-12 07:18:08,232 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
447
+ 2024-08-12 07:18:13,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
448
+ 2024-08-12 07:18:18,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
449
+ 2024-08-12 07:18:19,771 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
450
+ 2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
451
+ 2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
452
+ 2024-08-12 07:18:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
453
+ 2024-08-12 07:18:23,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
454
+ 2024-08-12 07:18:28,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
455
+ 2024-08-12 07:18:33,238 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
456
+ 2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
457
+ 2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
458
+ 2024-08-12 07:18:34,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
459
+ 2024-08-12 07:18:39,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
460
+ 2024-08-12 07:18:44,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
461
+ 2024-08-12 07:18:49,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
462
+ 2024-08-12 07:18:49,772 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
463
+ 2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
464
+ 2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
465
+ 2024-08-12 07:18:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
466
+ 2024-08-12 07:18:54,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
467
+ 2024-08-12 07:18:54,006 DEBUG SenderThread:13762 [sender.py:send():382] send: history
468
+ 2024-08-12 07:18:54,007 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
469
+ 2024-08-12 07:18:54,008 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
470
+ 2024-08-12 07:18:54,198 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
471
+ 2024-08-12 07:18:55,009 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
472
+ 2024-08-12 07:18:55,199 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
473
+ 2024-08-12 07:19:00,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
474
+ 2024-08-12 07:19:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
475
+ 2024-08-12 07:19:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
476
+ 2024-08-12 07:19:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
477
+ 2024-08-12 07:19:05,244 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
478
+ 2024-08-12 07:19:10,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
479
+ 2024-08-12 07:19:15,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
480
+ 2024-08-12 07:19:19,773 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
481
+ 2024-08-12 07:19:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
482
+ 2024-08-12 07:19:19,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
483
+ 2024-08-12 07:19:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
484
+ 2024-08-12 07:19:21,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
485
+ 2024-08-12 07:19:26,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
486
+ 2024-08-12 07:19:31,169 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
487
+ 2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
488
+ 2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
489
+ 2024-08-12 07:19:34,972 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
490
+ 2024-08-12 07:19:37,149 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
491
+ 2024-08-12 07:19:42,150 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
492
+ 2024-08-12 07:19:47,151 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
493
+ 2024-08-12 07:19:49,774 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
494
+ 2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
495
+ 2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
496
+ 2024-08-12 07:19:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
497
+ 2024-08-12 07:19:52,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
498
+ 2024-08-12 07:19:57,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
499
+ 2024-08-12 07:20:02,231 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
500
+ 2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
501
+ 2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
502
+ 2024-08-12 07:20:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
503
+ 2024-08-12 07:20:08,210 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
504
+ 2024-08-12 07:20:08,910 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
505
+ 2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send():382] send: history
506
+ 2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
507
+ 2024-08-12 07:20:08,914 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
508
+ 2024-08-12 07:20:09,243 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
509
+ 2024-08-12 07:20:09,244 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
510
+ 2024-08-12 07:20:12,332 DEBUG SenderThread:13762 [sender.py:send():382] send: exit
511
+ 2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():589] handling exit code: 255
512
+ 2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():591] handling runtime: 922
513
+ 2024-08-12 07:20:12,333 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
514
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_exit():597] send defer
515
+ 2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
516
+ 2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 0
517
+ 2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
518
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 0
519
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 1
520
+ 2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
521
+ 2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 1
522
+ 2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
523
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 1
524
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 2
525
+ 2024-08-12 07:20:12,335 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
526
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 2
527
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [system_monitor.py:finish():203] Stopping system monitor
528
+ 2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():179] Finished system metrics aggregation loop
529
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined cpu monitor
530
+ 2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():183] Publishing last batch of metrics
531
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined disk monitor
532
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined gpu monitor
533
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined memory monitor
534
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined network monitor
535
+ 2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
536
+ 2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 2
537
+ 2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 3
538
+ 2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
539
+ 2024-08-12 07:20:12,372 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
540
+ 2024-08-12 07:20:12,373 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 3
541
+ 2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send():382] send: history
542
+ 2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
543
+ 2024-08-12 07:20:12,375 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
544
+ 2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
545
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 3
546
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 4
547
+ 2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
548
+ 2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 4
549
+ 2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
550
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 4
551
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 5
552
+ 2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
553
+ 2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 5
554
+ 2024-08-12 07:20:12,377 DEBUG SenderThread:13762 [sender.py:send():382] send: summary
555
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
556
+ 2024-08-12 07:20:12,378 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
557
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 5
558
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 6
559
+ 2024-08-12 07:20:12,378 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
560
+ 2024-08-12 07:20:12,378 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 6
561
+ 2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
562
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 6
563
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 7
564
+ 2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
565
+ 2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
566
+ 2024-08-12 07:20:12,379 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 7
567
+ 2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
568
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 7
569
+ 2024-08-12 07:20:13,247 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
570
+ 2024-08-12 07:20:13,332 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
571
+ 2024-08-12 07:20:15,017 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 8
572
+ 2024-08-12 07:20:15,017 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
573
+ 2024-08-12 07:20:15,017 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
574
+ 2024-08-12 07:20:15,017 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 8
575
+ 2024-08-12 07:20:15,018 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
576
+ 2024-08-12 07:20:15,018 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 8
577
+ 2024-08-12 07:20:15,018 INFO SenderThread:13762 [job_builder.py:build():296] Attempting to build job artifact
578
+ 2024-08-12 07:20:15,019 INFO SenderThread:13762 [job_builder.py:_get_source_type():426] is repo sourced job
579
+ 2024-08-12 07:20:15,033 INFO SenderThread:13762 [job_builder.py:build():402] adding wandb-job metadata file
580
+ 2024-08-12 07:20:15,042 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 9
581
+ 2024-08-12 07:20:15,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
582
+ 2024-08-12 07:20:15,042 DEBUG SenderThread:13762 [sender.py:send():382] send: artifact
583
+ 2024-08-12 07:20:15,042 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 9
584
+ 2024-08-12 07:20:15,248 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
585
+ 2024-08-12 07:20:15,333 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
586
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
587
+ 2024-08-12 07:20:15,953 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
588
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 9
589
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [dir_watcher.py:finish():358] shutting down directory watcher
590
+ 2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_070449-ufge4h1y/files
591
+ 2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt requirements.txt
592
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml config.yaml
593
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json wandb-metadata.json
594
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json wandb-summary.json
595
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log output.log
596
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 10
597
+ 2024-08-12 07:20:16,250 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
598
+ 2024-08-12 07:20:16,251 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
599
+ 2024-08-12 07:20:16,251 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 10
600
+ 2024-08-12 07:20:16,251 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
601
+ 2024-08-12 07:20:16,251 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 10
602
+ 2024-08-12 07:20:16,251 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
603
+ 2024-08-12 07:20:20,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
604
+ 2024-08-12 07:20:25,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
605
+ 2024-08-12 07:20:30,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
606
+ 2024-08-12 07:20:35,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
607
+ 2024-08-12 07:20:40,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
608
+ 2024-08-12 07:20:43,105 WARNING StreamThr :13762 [internal.py:is_dead():414] Internal process exiting, parent pid 13691 disappeared
609
+ 2024-08-12 07:20:43,105 ERROR StreamThr :13762 [internal.py:wandb_internal():152] Internal process shutdown.
610
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [sender.py:finish():1572] shutting down sender
611
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
612
+ 2024-08-12 07:20:43,255 INFO HandlerThread:13762 [handler.py:finish():869] shutting down handler
613
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:join():178] waiting for file pusher
614
+ 2024-08-12 07:20:43,255 INFO WriterThread:13762 [datastore.py:close():296] close: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
615
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_stream.py:finish():595] file stream finish called
616
+ 2024-08-12 07:20:43,425 INFO SenderThread:13762 [file_stream.py:finish():599] file stream finish is done
wandb/run-20240812_070449-ufge4h1y/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-12 07:04:49,108 INFO MainThread:13691 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Configure stats pid to 13691
3
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug.log
9
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log
10
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:04:37', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 07:04:49,114 INFO MainThread:13691 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 07:04:49,115 INFO MainThread:13691 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 07:04:49,120 INFO MainThread:13691 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 07:04:49,131 INFO MainThread:13691 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 07:04:49,642 INFO MainThread:13691 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 07:04:49,789 INFO MainThread:13691 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb ADDED
Binary file (81.8 kB).
 
wandb/run-20240812_073202-yby212na/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-07:31:51
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 5
138
+ save_interval:
139
+ desc: null
140
+ value: 1
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723415522.366221
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240812_073202-yby212na/files/output.log ADDED
@@ -0,0 +1,116 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
11
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
12
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
13
+ BFloat16 enabled for mixed precision - using bfSixteen policy
14
+ --> applying fsdp activation checkpointing...
15
+ > datasets target sizes (minimum size):
16
+ train: 6400000
17
+ validation: 12803200
18
+ test: 3200
19
+ > building train, validation, and test datasets for GPT ...
20
+ > finished creating GPT datasets ...
21
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
22
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
23
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
24
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
25
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
26
+ model info: FullyShardedDataParallel(
27
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
28
+ (model): Qwen2Model(
29
+ (embed_tokens): Embedding(151936, 896)
30
+ (layers): ModuleList(
31
+ (0-23): 24 x FullyShardedDataParallel(
32
+ (_fsdp_wrapped_module): CheckpointWrapper(
33
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
34
+ (self_attn): Qwen2FlashAttention2(
35
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
36
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
37
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
39
+ (rotary_emb): Qwen2RotaryEmbedding()
40
+ )
41
+ (mlp): Qwen2MLP(
42
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
43
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
45
+ (act_fn): SiLU()
46
+ )
47
+ (input_layernorm): Qwen2RMSNorm()
48
+ (post_attention_layernorm): Qwen2RMSNorm()
49
+ )
50
+ )
51
+ )
52
+ )
53
+ (norm): Qwen2RMSNorm()
54
+ )
55
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
56
+ )
57
+ )
58
+ model config: Qwen2Config {
59
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
60
+ "architectures": [
61
+ "Qwen2ForCausalLM"
62
+ ],
63
+ "attention_dropout": 0.0,
64
+ "bos_token_id": 151643,
65
+ "eos_token_id": 151643,
66
+ "hidden_act": "silu",
67
+ "hidden_size": 896,
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 4864,
70
+ "label_smoothing": 0.0,
71
+ "max_position_embeddings": 4096,
72
+ "max_window_layers": 24,
73
+ "model_type": "qwen2",
74
+ "num_attention_heads": 14,
75
+ "num_hidden_layers": 24,
76
+ "num_key_value_heads": 2,
77
+ "rms_norm_eps": 1e-06,
78
+ "rope_theta": 1000000.0,
79
+ "sliding_window": null,
80
+ "tie_word_embeddings": true,
81
+ "torch_dtype": "bfloat16",
82
+ "transformers_version": "4.43.3",
83
+ "use_cache": false,
84
+ "use_sliding_window": false,
85
+ "vocab_size": 151936
86
+ }
87
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
88
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
89
+ warnings.warn(
90
+ Let split = None
91
+ Building a BlendedDataset for a single MegatronDataset
92
+ Unable to save the indexes because path_to_cache is None
93
+ Building a BlendedDataset for a single MegatronDataset
94
+ Unable to save the indexes because path_to_cache is None
95
+ Building a BlendedDataset for a single MegatronDataset
96
+ Unable to save the indexes because path_to_cache is None
97
+ ------------------------------------------------------------------
98
+ iteration: 1 , TFLOPS: 69.93553660778689, Tokens per sec: 17392.616605023257, Loss: 4.1814446449279785
99
+ ------------------------------------------------------------------
100
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001
101
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
102
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
103
+ warnings.warn(
104
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
105
+ warnings.warn(
106
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
107
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
108
+ [rank0]:[2024-08-12 07:33:22,462] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006542664999869885, 'preprocessing_with_comm': 0.0007797380003466969, 'state_converting': 0.9963913259998662, <Type.ALL: 'all'>: 1.0051406040001893})
109
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
110
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
111
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
112
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
113
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
114
+ None
115
+ /work/llm_recipes/models/yans-qwen2-0.5B/tokenizer
116
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001, took 4.39s
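The checkpoint layout logged above is a set of plain per-component torch files under an iteration directory: model.pt, optimizer.pt, scheduler.pt and rng.pt. The restore side lives in the llm-recipes training scripts; as a rough sketch of how those files fit together (an assumption, not the repository's own code):

```python
# Minimal sketch (assumption): reading back the artifacts written at iter_0000001.
# With world size 1 the model state dict is a full (NO_SHARD) state dict, as the warnings above note.
import torch

ckpt_dir = "/work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001"

model_state = torch.load(f"{ckpt_dir}/model.pt", map_location="cpu")      # model weights
optim_state = torch.load(f"{ckpt_dir}/optimizer.pt", map_location="cpu")  # Adam moments
sched_state = torch.load(f"{ckpt_dir}/scheduler.pt", map_location="cpu")  # cosine LR scheduler state
rng_state = torch.load(f"{ckpt_dir}/rng.pt", map_location="cpu")          # RNG states for reproducible resume

# model.load_state_dict(model_state) would follow, after building Qwen2ForCausalLM as shown earlier in the log.
```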
wandb/run-20240812_073202-yby212na/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
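The requirements list above is the environment snapshot wandb captures at run start. To check a live environment against it, a comparison along these lines works (my own sketch; it only handles exact `==` pins and assumes the file path of this run directory):

```python
# Sketch: diff the captured requirements.txt against the currently installed packages.
from importlib.metadata import version, PackageNotFoundError

with open("wandb/run-20240812_073202-yby212na/files/requirements.txt") as f:
    pins = [line.strip() for line in f if "==" in line]

for pin in pins:
    name, wanted = pin.split("==", 1)
    try:
        have = version(name)
    except PackageNotFoundError:
        have = "missing"
    if have != wanted:
        print(f"{name}: captured {wanted}, installed {have}")
```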
wandb/run-20240812_073202-yby212na/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T22:32:03.032279",
5
+ "startedAt": "2024-08-11T22:32:02.353340",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "1",
56
+ "--eval-interval",
57
+ "5",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-07:31:51"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
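The `executable`, `program`, and `args` fields in this metadata are enough to reconstruct the exact launch command. A small helper (my own sketch, not part of the repository) that rebuilds it from the JSON:

```python
# Sketch: rebuild the training command recorded in wandb-metadata.json.
import json
import shlex

with open("wandb/run-20240812_073202-yby212na/files/wandb-metadata.json") as f:
    meta = json.load(f)

cmd = [meta["executable"], meta["program"], *meta["args"]]
print(" ".join(shlex.quote(part) for part in cmd))
# -> /usr/bin/python /project/examples/finetuning.py --seq-length 4096 --sliding-window-size 4096 ...
```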
wandb/run-20240812_073202-yby212na/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 132}, "training/loss": 4.1814446449279785, "training/perplexity": 65.46035190441053, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1, "optimizer/lr": 1.038e-06, "optimizer/variance_l2": 0.001437161465185535, "optimizer/variance_sqrt_l2": 0.22307888709863474, "optimizer/momentum_l2": 0.09989735636562776, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.04984140396118164, "optimizer/variance_sqrt_l1": 889.25, "optimizer/momentum_l1": 397.875, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00101470947265625, "optimizer/variance_sqrt_abs_max": 0.03173828125, "optimizer/momentum_abs_max": 0.0142822265625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 75.37911228499979, "stats/tokens_per_sec": 17392.616605023257, "stats/tokens_per_sec_per_gpu": 17392.616605023257, "stats/tflops": 69.93553660778689, "_timestamp": 1723415599.9530108, "_runtime": 77.58678984642029, "_step": 1}
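The derived metrics in this summary follow directly from the raw values it also records: perplexity is exp(loss), and tokens per second is seq_len × global_batch_size / iteration_time. A quick consistency check (assuming those definitions, which the logged numbers agree with):

```python
# Sketch: sanity-check the derived metrics in wandb-summary.json.
import math

loss = 4.1814446449279785           # training/loss
seq_len = 4097                      # utils/seq_len as logged (one more than --seq-length 4096)
global_batch_size = 320             # utils/global_batch_size
iteration_time = 75.37911228499979  # stats/1_iteration_time, in seconds

print(math.exp(loss))                                # ~65.46   -> training/perplexity
print(seq_len * global_batch_size / iteration_time)  # ~17392.6 -> stats/tokens_per_sec
```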
wandb/run-20240812_073202-yby212na/logs/debug-internal.log ADDED
@@ -0,0 +1,236 @@
1
+ 2024-08-12 07:32:02,368 INFO StreamThr :14458 [internal.py:wandb_internal():86] W&B internal server running at pid: 14458, started at: 2024-08-12 07:32:02.367023
2
+ 2024-08-12 07:32:02,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 07:32:02,371 INFO WriterThread:14458 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
4
+ 2024-08-12 07:32:02,372 DEBUG SenderThread:14458 [sender.py:send():382] send: header
5
+ 2024-08-12 07:32:02,386 DEBUG SenderThread:14458 [sender.py:send():382] send: run
6
+ 2024-08-12 07:32:02,917 INFO SenderThread:14458 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_073202-yby212na/files
7
+ 2024-08-12 07:32:02,917 INFO SenderThread:14458 [sender.py:_start_run_threads():1136] run started: yby212na with start time 1723415522.366221
8
+ 2024-08-12 07:32:02,923 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 07:32:02,923 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 07:32:03,012 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 07:32:03,018 INFO HandlerThread:14458 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 07:32:03,019 INFO HandlerThread:14458 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 07:32:03,021 INFO SystemMonitor:14458 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 07:32:03,022 INFO SystemMonitor:14458 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 07:32:03,032 DEBUG HandlerThread:14458 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 07:32:03,034 DEBUG HandlerThread:14458 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 07:32:03,046 DEBUG HandlerThread:14458 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:32:03.032279', 'startedAt': '2024-08-11T22:32:02.353340', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '1', '--eval-interval', '5', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:31:51'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 07:32:03,048 INFO HandlerThread:14458 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 07:32:03,054 DEBUG SenderThread:14458 [sender.py:send():382] send: files
30
+ 2024-08-12 07:32:03,054 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 07:32:03,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-12 07:32:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-12 07:32:03,067 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 07:32:03,383 DEBUG SenderThread:14458 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 07:32:03,716 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /tmp/tmpjkv15ab8wandb/lrd2pdzk-wandb-metadata.json
38
+ 2024-08-12 07:32:03,919 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
39
+ 2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/output.log
40
+ 2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json
41
+ 2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
42
+ 2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
43
+ 2024-08-12 07:32:05,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
44
+ 2024-08-12 07:32:07,384 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-12 07:32:12,385 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-12 07:32:17,386 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
47
+ 2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
49
+ 2024-08-12 07:32:18,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
50
+ 2024-08-12 07:32:23,322 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-12 07:32:28,323 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
52
+ 2024-08-12 07:32:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
53
+ 2024-08-12 07:32:33,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
54
+ 2024-08-12 07:32:33,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
55
+ 2024-08-12 07:32:34,273 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-12 07:32:34,938 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/config.yaml
57
+ 2024-08-12 07:32:39,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 07:32:44,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-12 07:32:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
60
+ 2024-08-12 07:32:48,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
61
+ 2024-08-12 07:32:48,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
62
+ 2024-08-12 07:32:50,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-12 07:32:55,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-12 07:33:00,339 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
65
+ 2024-08-12 07:33:03,022 DEBUG SystemMonitor:14458 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-12 07:33:03,024 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
67
+ 2024-08-12 07:33:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
68
+ 2024-08-12 07:33:03,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
69
+ 2024-08-12 07:33:03,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
70
+ 2024-08-12 07:33:06,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-12 07:33:11,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-12 07:33:16,282 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-12 07:33:18,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
74
+ 2024-08-12 07:33:18,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
75
+ 2024-08-12 07:33:18,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
76
+ 2024-08-12 07:33:19,954 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: partial_history
77
+ 2024-08-12 07:33:21,450 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
78
+ 2024-08-12 07:33:21,967 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
79
+ 2024-08-12 07:33:23,969 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
80
+ 2024-08-12 07:33:25,970 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
81
+ 2024-08-12 07:33:27,344 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 07:33:32,345 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-12 07:33:33,025 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
84
+ 2024-08-12 07:33:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-12 07:33:33,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
86
+ 2024-08-12 07:33:33,066 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
87
+ 2024-08-12 07:33:38,333 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-12 07:33:43,334 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 07:33:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
90
+ 2024-08-12 07:33:48,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
91
+ 2024-08-12 07:33:48,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
92
+ 2024-08-12 07:33:49,288 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-12 07:33:54,289 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 07:33:59,290 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 07:34:03,026 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
96
+ 2024-08-12 07:34:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
97
+ 2024-08-12 07:34:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
98
+ 2024-08-12 07:34:03,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
99
+ 2024-08-12 07:34:05,251 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
100
+ 2024-08-12 07:34:10,252 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 07:34:15,187 DEBUG SenderThread:14458 [sender.py:send():382] send: exit
102
+ 2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():589] handling exit code: 255
103
+ 2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():591] handling runtime: 132
104
+ 2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
105
+ 2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:send_exit():597] send defer
106
+ 2024-08-12 07:34:15,189 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 0
108
+ 2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 0
110
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 1
111
+ 2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
112
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 1
113
+ 2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
114
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 1
115
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 2
116
+ 2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
117
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 2
118
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [system_monitor.py:finish():203] Stopping system monitor
119
+ 2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():179] Finished system metrics aggregation loop
120
+ 2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():183] Publishing last batch of metrics
121
+ 2024-08-12 07:34:15,191 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined cpu monitor
122
+ 2024-08-12 07:34:15,192 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined disk monitor
123
+ 2024-08-12 07:34:15,225 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined gpu monitor
124
+ 2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined memory monitor
125
+ 2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined network monitor
126
+ 2024-08-12 07:34:15,226 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
127
+ 2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 2
128
+ 2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 3
129
+ 2024-08-12 07:34:15,227 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
130
+ 2024-08-12 07:34:15,227 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-08-12 07:34:15,227 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 3
132
+ 2024-08-12 07:34:15,229 DEBUG SenderThread:14458 [sender.py:send():382] send: history
133
+ 2024-08-12 07:34:15,230 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: summary_record
134
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
135
+ 2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
136
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 3
137
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 4
138
+ 2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
139
+ 2024-08-12 07:34:15,231 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 4
140
+ 2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
141
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 4
142
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 5
143
+ 2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-12 07:34:15,232 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 5
145
+ 2024-08-12 07:34:15,232 DEBUG SenderThread:14458 [sender.py:send():382] send: summary
146
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
147
+ 2024-08-12 07:34:15,233 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
148
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 5
149
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 6
150
+ 2024-08-12 07:34:15,233 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
151
+ 2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 6
152
+ 2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
153
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 6
154
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 7
155
+ 2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
156
+ 2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 7
158
+ 2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 7
160
+ 2024-08-12 07:34:15,862 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 8
161
+ 2024-08-12 07:34:15,862 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-12 07:34:15,862 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 8
163
+ 2024-08-12 07:34:15,863 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-12 07:34:15,863 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 8
165
+ 2024-08-12 07:34:15,863 INFO SenderThread:14458 [job_builder.py:build():296] Attempting to build job artifact
166
+ 2024-08-12 07:34:15,864 INFO SenderThread:14458 [job_builder.py:_get_source_type():426] is repo sourced job
167
+ 2024-08-12 07:34:15,878 INFO SenderThread:14458 [job_builder.py:build():402] adding wandb-job metadata file
168
+ 2024-08-12 07:34:15,887 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 9
169
+ 2024-08-12 07:34:15,887 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
170
+ 2024-08-12 07:34:15,887 DEBUG SenderThread:14458 [sender.py:send():382] send: artifact
171
+ 2024-08-12 07:34:15,887 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 9
172
+ 2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
173
+ 2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
174
+ 2024-08-12 07:34:16,187 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
176
+ 2024-08-12 07:34:16,750 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
177
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 9
178
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [dir_watcher.py:finish():358] shutting down directory watcher
179
+ 2024-08-12 07:34:17,003 INFO SenderThread:14458 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_073202-yby212na/files
180
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt requirements.txt
181
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/config.yaml config.yaml
182
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json wandb-metadata.json
183
+ 2024-08-12 07:34:17,006 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json wandb-summary.json
184
+ 2024-08-12 07:34:17,008 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/output.log output.log
185
+ 2024-08-12 07:34:17,009 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 10
186
+ 2024-08-12 07:34:17,009 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
187
+ 2024-08-12 07:34:17,011 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
188
+ 2024-08-12 07:34:17,011 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 10
189
+ 2024-08-12 07:34:17,012 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
190
+ 2024-08-12 07:34:17,012 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 10
191
+ 2024-08-12 07:34:17,012 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
192
+ 2024-08-12 07:34:17,188 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
193
+ 2024-08-12 07:34:17,188 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
194
+ 2024-08-12 07:34:17,408 INFO wandb-upload_1:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/config.yaml
195
+ 2024-08-12 07:34:17,511 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
196
+ 2024-08-12 07:34:17,588 INFO wandb-upload_2:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
197
+ 2024-08-12 07:34:17,614 INFO wandb-upload_3:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/output.log
198
+ 2024-08-12 07:34:17,814 INFO Thread-11 (_thread_body):14458 [sender.py:transition_state():617] send defer: 11
199
+ 2024-08-12 07:34:17,814 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
200
+ 2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 11
201
+ 2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
202
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 11
203
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
204
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 12
205
+ 2024-08-12 07:34:17,815 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
206
+ 2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 12
207
+ 2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
208
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 12
209
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_stream.py:finish():595] file stream finish called
210
+ 2024-08-12 07:34:18,362 INFO SenderThread:14458 [file_stream.py:finish():599] file stream finish is done
211
+ 2024-08-12 07:34:18,362 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 13
212
+ 2024-08-12 07:34:18,362 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
213
+ 2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 13
214
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
215
+ 2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 13
216
+ 2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 14
217
+ 2024-08-12 07:34:18,363 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
218
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: final
219
+ 2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 14
220
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: footer
221
+ 2024-08-12 07:34:18,364 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
222
+ 2024-08-12 07:34:18,364 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 14
223
+ 2024-08-12 07:34:21,364 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
224
+ 2024-08-12 07:34:26,365 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
225
+ 2024-08-12 07:34:31,366 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
226
+ 2024-08-12 07:34:36,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-12 07:34:41,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
228
+ 2024-08-12 07:34:46,368 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
229
+ 2024-08-12 07:34:51,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
230
+ 2024-08-12 07:34:51,550 WARNING StreamThr :14458 [internal.py:is_dead():414] Internal process exiting, parent pid 14387 disappeared
231
+ 2024-08-12 07:34:51,550 ERROR StreamThr :14458 [internal.py:wandb_internal():152] Internal process shutdown.
232
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [sender.py:finish():1572] shutting down sender
233
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
234
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
235
+ 2024-08-12 07:34:52,369 INFO HandlerThread:14458 [handler.py:finish():869] shutting down handler
236
+ 2024-08-12 07:34:52,369 INFO WriterThread:14458 [datastore.py:close():296] close: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
wandb/run-20240812_073202-yby212na/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Configure stats pid to 14387
3
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_073202-yby212na/logs/debug.log
9
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_073202-yby212na/logs/debug-internal.log
10
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:31:51', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 5, 'save_interval': 1, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 07:32:02,365 INFO MainThread:14387 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 07:32:02,366 INFO MainThread:14387 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 07:32:02,370 INFO MainThread:14387 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 07:32:02,382 INFO MainThread:14387 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 07:32:02,922 INFO MainThread:14387 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 07:32:03,065 INFO MainThread:14387 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
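debug.log shows the run being created through a standard wandb.init call, with the full training namespace passed as config and the model architecture merged in later via the config callback. A stripped-down sketch of that call, using only values visible in the log (the real script passes the complete argument namespace):

```python
# Sketch (assumption): the shape of the wandb.init call behind this run.
import wandb

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="yans-qwen2-0.5B_train_2024-08-12-07:31:51",
    config={
        "model_architecture": "Qwen2ForCausalLM",
        "hidden_size": 896,
        "num_hidden_layers": 24,
        "global_batch_size": 320,
        "seq_length": 4096,
    },
)
# the training loop then calls run.log({...}) each iteration before run.finish()
```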
wandb/run-20240812_073202-yby212na/run-yby212na.wandb ADDED
Binary file (26.1 kB). View file
 
wandb/run-20240815_041534-1ld4rgmy/files/config.yaml ADDED
@@ -0,0 +1,337 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-15-04:15:21
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 10
138
+ save_interval:
139
+ desc: null
140
+ value: 10
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723662934.646627
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ - 105
300
+ 2:
301
+ - 1
302
+ - 11
303
+ - 49
304
+ - 55
305
+ - 71
306
+ - 105
307
+ 3:
308
+ - 13
309
+ - 16
310
+ - 23
311
+ 4: 3.10.12
312
+ 5: 0.16.3
313
+ 6: 4.43.3
314
+ 8:
315
+ - 5
316
+ 13: linux-x86_64
317
+ model_architecture:
318
+ desc: null
319
+ value: Qwen2ForCausalLM
320
+ activation_function:
321
+ desc: null
322
+ value: silu
323
+ hidden_size:
324
+ desc: null
325
+ value: 896
326
+ model_type:
327
+ desc: null
328
+ value: qwen2
329
+ max_position_embeddings:
330
+ desc: null
331
+ value: 4096
332
+ num_attention_heads:
333
+ desc: null
334
+ value: 14
335
+ num_hidden_layers:
336
+ desc: null
337
+ value: 24
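
The batch arithmetic recorded in this config is internally consistent: with world_size 1 and micro_batch_size 1, a global batch of 320 implies 320 gradient-accumulation steps. A minimal sketch of that check in Python, assuming the usual relation global batch = micro batch x data-parallel ranks x accumulation steps (values are copied from the config above; the code is illustrative, not taken from the repo):

    # Sanity check of the batch settings in this run's config.yaml (illustrative only).
    global_batch_size = 320   # global_batch_size
    micro_batch_size = 1      # micro_batch_size
    world_size = 1            # world_size (single A100 in this run)

    # Assumed relation: global = micro * data-parallel ranks * accumulation steps.
    grad_accum = global_batch_size // (micro_batch_size * world_size)
    assert grad_accum == 320  # matches gradient_accumulation_steps above
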
wandb/run-20240815_041534-1ld4rgmy/files/output.log ADDED
@@ -0,0 +1,92 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ --> applying fsdp activation checkpointing...
11
+ > datasets target sizes (minimum size):
12
+ train: 6400000
13
+ validation: 6403200
14
+ test: 3200
15
+ > building train, validation, and test datasets for GPT ...
16
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
17
+ warnings.warn(
18
+ Let split = None
19
+ Building a BlendedDataset for a single MegatronDataset
20
+ Unable to save the indexes because path_to_cache is None
21
+ Building a BlendedDataset for a single MegatronDataset
22
+ Unable to save the indexes because path_to_cache is None
23
+ Building a BlendedDataset for a single MegatronDataset
24
+ Unable to save the indexes because path_to_cache is None
25
+ > finished creating GPT datasets ...
26
+ Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
27
+ Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
30
+ (model): Qwen2Model(
31
+ (embed_tokens): Embedding(151936, 896)
32
+ (layers): ModuleList(
33
+ (0-23): 24 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
36
+ (self_attn): Qwen2FlashAttention2(
37
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
38
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
40
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
41
+ (rotary_emb): Qwen2RotaryEmbedding()
42
+ )
43
+ (mlp): Qwen2MLP(
44
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
46
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): Qwen2RMSNorm()
50
+ (post_attention_layernorm): Qwen2RMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): Qwen2RMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
58
+ )
59
+ )
60
+ model config: Qwen2Config {
61
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
62
+ "architectures": [
63
+ "Qwen2ForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 151643,
67
+ "eos_token_id": 151643,
68
+ "hidden_act": "silu",
69
+ "hidden_size": 896,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4864,
72
+ "label_smoothing": 0.0,
73
+ "max_position_embeddings": 4096,
74
+ "max_window_layers": 24,
75
+ "model_type": "qwen2",
76
+ "num_attention_heads": 14,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": null,
82
+ "tie_word_embeddings": true,
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "use_sliding_window": false,
87
+ "vocab_size": 151936
88
+ }
89
+ [rank0]:[2024-08-15 04:15:41,598] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
90
+ ------------------------------------------------------------------
91
+ iteration: 1161 , TFLOPS: 71.0304706218284, Tokens per sec: 17664.9211934734, Loss: 2.442603349685669
92
+ ------------------------------------------------------------------
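
The log above records the FSDP configuration actually applied: FULL_SHARD was requested but FSDP falls back to NO_SHARD because the world size is 1, bfloat16 mixed precision is enabled ("bfSixteen policy"), and activation checkpointing is applied. Below is a minimal Python sketch of that kind of setup; it is an assumption about the wrapping code, not the repo's finetuning.py, and it presumes torch.distributed is already initialized:

    import torch
    from torch.distributed.fsdp import (
        FullyShardedDataParallel as FSDP,
        MixedPrecision,
        ShardingStrategy,
    )
    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        apply_activation_checkpointing,
    )

    def wrap_for_training(model: torch.nn.Module) -> FSDP:
        # "BFloat16 enabled for mixed precision - using bfSixteen policy"
        bf16_policy = MixedPrecision(
            param_dtype=torch.bfloat16,
            reduce_dtype=torch.bfloat16,
            buffer_dtype=torch.bfloat16,
        )
        # FULL_SHARD is requested; with world_size == 1 FSDP switches to NO_SHARD,
        # which is the UserWarning shown in the log.
        wrapped = FSDP(
            model,
            sharding_strategy=ShardingStrategy.FULL_SHARD,
            mixed_precision=bf16_policy,
        )
        # "--> applying fsdp activation checkpointing..."
        apply_activation_checkpointing(wrapped)
        return wrapped
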
wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ cachetools==5.3.2
23
+ catalogue==2.0.10
24
+ certifi==2024.2.2
25
+ cffi==1.16.0
26
+ chardet==5.2.0
27
+ charset-normalizer==3.3.2
28
+ click==8.1.7
29
+ cloudpathlib==0.16.0
30
+ cloudpickle==3.0.0
31
+ cmake==3.28.1
32
+ colorama==0.4.6
33
+ comm==0.2.1
34
+ confection==0.1.4
35
+ contourpy==1.2.0
36
+ cramjam==2.8.3
37
+ cubinlinker==0.3.0+2.g405ac64
38
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
39
+ cudf==23.12.0
40
+ cugraph-dgl==23.12.0
41
+ cugraph-service-client==23.12.0
42
+ cugraph-service-server==23.12.0
43
+ cugraph==23.12.0
44
+ cuml==23.12.0
45
+ cupy-cuda12x==12.3.0
46
+ cycler==0.12.1
47
+ cymem==2.0.8
48
+ cython==3.0.8
49
+ dask-cuda==23.12.0
50
+ dask-cudf==23.12.0
51
+ dask==2023.11.0
52
+ dataclasses-json==0.6.7
53
+ dataproperty==1.0.1
54
+ datasets==2.20.0
55
+ debugpy==1.8.1
56
+ decorator==5.1.1
57
+ defusedxml==0.7.1
58
+ dill==0.3.8
59
+ distributed==2023.11.0
60
+ distro==1.9.0
61
+ dm-tree==0.1.8
62
+ docker-pycreds==0.4.0
63
+ einops==0.7.0
64
+ emoji==2.12.1
65
+ entmax==1.3
66
+ evaluate==0.4.2
67
+ exceptiongroup==1.2.0
68
+ execnet==2.0.2
69
+ executing==2.0.1
70
+ expecttest==0.1.3
71
+ fastjsonschema==2.19.1
72
+ fastparquet==2023.10.1
73
+ fastrlock==0.8.2
74
+ filelock==3.13.1
75
+ flash-attn==2.4.2
76
+ fonttools==4.48.1
77
+ frozenlist==1.4.1
78
+ fsspec==2023.12.2
79
+ fugashi==1.3.2
80
+ fuzzywuzzy==0.18.0
81
+ gast==0.5.4
82
+ gitdb==4.0.11
83
+ gitpython==3.1.43
84
+ google-auth-oauthlib==0.4.6
85
+ google-auth==2.27.0
86
+ graphsurgeon==0.4.6
87
+ greenlet==3.0.3
88
+ grpcio==1.60.1
89
+ h11==0.14.0
90
+ httpcore==1.0.5
91
+ httpx==0.27.0
92
+ huggingface-hub==0.24.5
93
+ hydra-core==1.3.2
94
+ hypothesis==5.35.1
95
+ idna==3.6
96
+ importlib-metadata==7.0.1
97
+ iniconfig==2.0.0
98
+ intel-openmp==2021.4.0
99
+ ipadic==1.0.0
100
+ ipykernel==6.29.2
101
+ ipython-genutils==0.2.0
102
+ ipython==8.21.0
103
+ isort==5.13.2
104
+ jedi==0.19.1
105
+ jinja2==3.1.3
106
+ jiter==0.5.0
107
+ joblib==1.3.2
108
+ json5==0.9.14
109
+ jsonargparse==3.13.1
110
+ jsonlines==4.0.0
111
+ jsonnet==0.19.1
112
+ jsonpatch==1.33
113
+ jsonpointer==3.0.0
114
+ jsonschema-specifications==2023.12.1
115
+ jsonschema==4.21.1
116
+ jupyter-client==8.6.0
117
+ jupyter-core==5.7.1
118
+ jupyter-tensorboard==0.2.0
119
+ jupyterlab-pygments==0.3.0
120
+ jupyterlab-server==1.2.0
121
+ jupyterlab==2.3.2
122
+ jupytext==1.16.1
123
+ kiwisolver==1.4.5
124
+ langchain-community==0.2.12
125
+ langchain-core==0.2.31
126
+ langchain-huggingface==0.0.2
127
+ langchain-openai==0.1.21
128
+ langchain-text-splitters==0.2.2
129
+ langchain==0.2.13
130
+ langcodes==3.3.0
131
+ langsmith==0.1.99
132
+ lazy-loader==0.3
133
+ levenshtein==0.25.1
134
+ librosa==0.10.1
135
+ lightning-utilities==0.11.6
136
+ llm-jp-eval==1.4.0
137
+ llvmlite==0.40.1
138
+ lm-eval==0.3.0
139
+ locket==1.0.0
140
+ logzero==1.7.0
141
+ lxml==5.2.2
142
+ markdown-it-py==3.0.0
143
+ markdown==3.5.2
144
+ markupsafe==2.1.4
145
+ marshmallow==3.21.3
146
+ matplotlib-inline==0.1.6
147
+ matplotlib==3.8.2
148
+ mbstrdecoder==1.1.3
149
+ mccabe==0.7.0
150
+ mdit-py-plugins==0.4.0
151
+ mdurl==0.1.2
152
+ mecab-python3==1.0.6
153
+ mistune==3.0.2
154
+ mkl-devel==2021.1.1
155
+ mkl-include==2021.1.1
156
+ mkl==2021.1.1
157
+ mock==5.1.0
158
+ mojimoji==0.0.13
159
+ more-itertools==9.1.0
160
+ mpmath==1.3.0
161
+ msgpack==1.0.7
162
+ multidict==6.0.4
163
+ multiprocess==0.70.16
164
+ murmurhash==1.0.10
165
+ mypy-extensions==1.0.0
166
+ nbclient==0.9.0
167
+ nbconvert==7.16.0
168
+ nbformat==5.9.2
169
+ neologdn==0.5.3
170
+ nest-asyncio==1.6.0
171
+ networkx==2.6.3
172
+ ninja==1.11.1.1
173
+ nltk==3.8.1
174
+ notebook==6.4.10
175
+ numba==0.57.1+1.g1ff679645
176
+ numexpr==2.10.1
177
+ numpy==1.24.4
178
+ nvfuser==0.1.4a0+d0bb811
179
+ nvidia-dali-cuda120==1.34.0
180
+ nvidia-pyindex==1.0.9
181
+ nvtx==0.2.5
182
+ oauthlib==3.2.2
183
+ omegaconf==2.3.0
184
+ onnx==1.15.0rc2
185
+ openai==1.40.6
186
+ opencv==4.7.0
187
+ optree==0.10.0
188
+ orjson==3.10.7
189
+ packaging==23.2
190
+ pandas==2.2.2
191
+ pandocfilters==1.5.1
192
+ parso==0.8.3
193
+ partd==1.4.1
194
+ pathvalidate==3.2.0
195
+ peft==0.5.0
196
+ pexpect==4.9.0
197
+ pillow==10.2.0
198
+ pip==24.0
199
+ plac==1.4.3
200
+ platformdirs==4.2.0
201
+ pluggy==1.4.0
202
+ ply==3.11
203
+ polygraphy==0.49.4
204
+ pooch==1.8.0
205
+ portalocker==2.10.1
206
+ preshed==3.0.9
207
+ prettytable==3.9.0
208
+ prometheus-client==0.19.0
209
+ prompt-toolkit==3.0.43
210
+ protobuf==4.24.4
211
+ psutil==5.9.4
212
+ ptxcompiler==0.8.1+2.g0d406d6
213
+ ptyprocess==0.7.0
214
+ pure-eval==0.2.2
215
+ pyarrow-hotfix==0.6
216
+ pyarrow==15.0.2
217
+ pyasn1-modules==0.3.0
218
+ pyasn1==0.5.1
219
+ pybind11-global==2.11.1
220
+ pybind11==2.11.1
221
+ pycocotools==2.0+nv0.8.0
222
+ pycountry==24.6.1
223
+ pycparser==2.21
224
+ pydantic-core==2.16.2
225
+ pydantic==2.6.1
226
+ pygments==2.17.2
227
+ pylibcugraph==23.12.0
228
+ pylibcugraphops==23.12.0
229
+ pylibraft==23.12.0
230
+ pylint==3.2.6
231
+ pynvml==11.4.1
232
+ pyparsing==3.1.1
233
+ pytablewriter==1.2.0
234
+ pytest-flakefinder==1.1.0
235
+ pytest-rerunfailures==13.0
236
+ pytest-shard==0.1.2
237
+ pytest-xdist==3.5.0
238
+ pytest==8.0.0
239
+ python-dateutil==2.8.2
240
+ python-dotenv==1.0.0
241
+ python-hostlist==1.23.0
242
+ python-levenshtein==0.25.1
243
+ pytorch-lightning==2.4.0
244
+ pytorch-quantization==2.1.2
245
+ pytz==2023.3.post1
246
+ pyyaml==6.0.1
247
+ pyzmq==25.1.2
248
+ raft-dask==23.12.0
249
+ rapidfuzz==3.9.6
250
+ rapids-dask-dependency==23.12.1
251
+ referencing==0.33.0
252
+ regex==2023.12.25
253
+ requests-oauthlib==1.3.1
254
+ requests==2.32.3
255
+ rhoknp==1.7.0
256
+ rich==13.7.0
257
+ rmm==23.12.0
258
+ rouge-score==0.1.2
259
+ rpds-py==0.17.1
260
+ rsa==4.9
261
+ sacrebleu==2.4.2
262
+ safetensors==0.4.3
263
+ scikit-learn==1.5.1
264
+ scipy==1.12.0
265
+ send2trash==1.8.2
266
+ sentence-transformers==3.0.1
267
+ sentencepiece==0.1.99
268
+ sentry-sdk==2.12.0
269
+ setproctitle==1.3.3
270
+ setuptools==68.2.2
271
+ six==1.16.0
272
+ smart-open==6.4.0
273
+ smmap==5.0.1
274
+ sniffio==1.3.1
275
+ sortedcontainers==2.4.0
276
+ soundfile==0.12.1
277
+ soupsieve==2.5
278
+ soxr==0.3.7
279
+ spacy-legacy==3.0.12
280
+ spacy-loggers==1.0.5
281
+ spacy==3.7.2
282
+ sphinx-glpi-theme==0.6
283
+ sqlalchemy==2.0.32
284
+ sqlitedict==2.1.0
285
+ srsly==2.4.8
286
+ stack-data==0.6.3
287
+ sumeval==0.2.2
288
+ sympy==1.12
289
+ tabledata==1.3.3
290
+ tabulate==0.9.0
291
+ tbb==2021.11.0
292
+ tblib==3.0.0
293
+ tcolorpy==0.1.6
294
+ tenacity==8.5.0
295
+ tensorboard-data-server==0.6.1
296
+ tensorboard-plugin-wit==1.8.1
297
+ tensorboard==2.9.0
298
+ tensorrt==8.6.3
299
+ terminado==0.18.0
300
+ termplotlib==0.3.9
301
+ text-generation==0.7.0
302
+ thinc==8.2.3
303
+ threadpoolctl==3.2.0
304
+ thriftpy2==0.4.17
305
+ tiktoken==0.7.0
306
+ tinycss2==1.2.1
307
+ tokenizers==0.19.1
308
+ toml==0.10.2
309
+ tomli==2.0.1
310
+ tomlkit==0.13.2
311
+ toolz==0.12.1
312
+ torch-tensorrt==2.3.0a0
313
+ torch==2.3.0a0+ebedce2
314
+ torchdata==0.7.1a0
315
+ torchmetrics==0.10.3
316
+ torchtext==0.17.0a0
317
+ torchvision==0.18.0a0
318
+ tornado==6.4
319
+ tqdm-multiprocess==0.0.11
320
+ tqdm==4.66.5
321
+ traitlets==5.9.0
322
+ transformer-engine==1.3.0+5b90b7f
323
+ transformers==4.43.3
324
+ treelite-runtime==3.9.1
325
+ treelite==3.9.1
326
+ triton==2.2.0+e28a256
327
+ typepy==1.3.2
328
+ typer==0.9.0
329
+ types-dataclasses==0.6.6
330
+ typing-extensions==4.12.2
331
+ typing-inspect==0.9.0
332
+ tzdata==2024.1
333
+ ucx-py==0.35.0
334
+ uff==0.6.9
335
+ ujson==5.8.0
336
+ unbabel-comet==2.2.2
337
+ unidic-lite==1.0.8
338
+ urllib3==1.26.18
339
+ wandb==0.16.3
340
+ wasabi==1.1.2
341
+ wcwidth==0.2.13
342
+ weasel==0.3.4
343
+ webencodings==0.5.1
344
+ werkzeug==3.0.1
345
+ wheel==0.42.0
346
+ word2number==1.1
347
+ xdoctest==1.0.2
348
+ xgboost==1.7.6
349
+ xmltodict==0.13.0
350
+ xxhash==3.4.1
351
+ yarl==1.9.4
352
+ zict==3.0.0
353
+ zipp==3.17.0
354
+ zstandard==0.23.0
wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-14T19:15:35.173102",
5
+ "startedAt": "2024-08-14T19:15:34.633818",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "10",
56
+ "--eval-interval",
57
+ "10",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-15-04:15:21"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 86}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.0046823736576586325, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.284942626953125, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.21714400200017, "stats/tokens_per_sec": 17664.9211934734, "stats/tokens_per_sec_per_gpu": 17664.9211934734, "stats/tflops": 71.0304706218284, "_timestamp": 1723663016.4553976, "_runtime": 81.8087706565857, "_step": 1161}
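
The summary metrics above are mutually consistent: training/perplexity is exp(training/loss), and stats/tokens_per_sec equals seq_len x global_batch_size / iteration_time. The reported optimizer/lr (about 1.995e-05 at iteration 1161) is also what the config's cosine schedule (lr 2e-5, min_lr 1e-6, 500 warmup and 20000 decay iterations) predicts. A small Python check using the values above (illustrative, not the logging code itself):

    import math

    # Values copied from wandb-summary.json above.
    loss = 2.442603349685669
    seq_len = 4097                      # utils/seq_len
    global_batch_size = 320             # utils/global_batch_size
    iteration_time = 74.21714400200017  # stats/1_iteration_time, in seconds

    print(math.exp(loss))                                # ~11.50, matches training/perplexity
    print(seq_len * global_batch_size / iteration_time)  # ~17664.9, matches stats/tokens_per_sec
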
wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log ADDED
@@ -0,0 +1,162 @@
1
+ 2024-08-15 04:15:34,649 INFO StreamThr :12253 [internal.py:wandb_internal():86] W&B internal server running at pid: 12253, started at: 2024-08-15 04:15:34.648066
2
+ 2024-08-15 04:15:34,650 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-15 04:15:34,652 INFO WriterThread:12253 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
4
+ 2024-08-15 04:15:34,653 DEBUG SenderThread:12253 [sender.py:send():382] send: header
5
+ 2024-08-15 04:15:34,666 DEBUG SenderThread:12253 [sender.py:send():382] send: run
6
+ 2024-08-15 04:15:35,078 INFO SenderThread:12253 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_041534-1ld4rgmy/files
7
+ 2024-08-15 04:15:35,078 INFO SenderThread:12253 [sender.py:_start_run_threads():1136] run started: 1ld4rgmy with start time 1723662934.646627
8
+ 2024-08-15 04:15:35,084 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-15 04:15:35,084 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-15 04:15:35,155 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():27] System info init
12
+ 2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():42] System info init done
13
+ 2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-15 04:15:35,161 INFO SystemMonitor:12253 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-15 04:15:35,163 INFO SystemMonitor:12253 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-15 04:15:35,173 DEBUG HandlerThread:12253 [system_info.py:probe():151] Probing system
22
+ 2024-08-15 04:15:35,175 DEBUG HandlerThread:12253 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:probe():199] Probing system done
25
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T19:15:35.173102', 'startedAt': '2024-08-14T19:15:34.633818', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-04:15:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-15 04:15:35,189 INFO HandlerThread:12253 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-15 04:15:35,195 DEBUG SenderThread:12253 [sender.py:send():382] send: files
30
+ 2024-08-15 04:15:35,195 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-15 04:15:35,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-15 04:15:35,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-15 04:15:35,209 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-15 04:15:35,448 DEBUG SenderThread:12253 [sender.py:send():382] send: telemetry
37
+ 2024-08-15 04:15:35,826 INFO wandb-upload_0:12253 [upload_job.py:push():131] Uploaded file /tmp/tmprvuc38znwandb/8jb1h2yo-wandb-metadata.json
38
+ 2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt
39
+ 2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json
40
+ 2024-08-15 04:15:36,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
41
+ 2024-08-15 04:15:38,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
42
+ 2024-08-15 04:15:40,019 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-15 04:15:40,082 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
44
+ 2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
45
+ 2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
46
+ 2024-08-15 04:15:42,083 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
47
+ 2024-08-15 04:15:44,084 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
48
+ 2024-08-15 04:15:45,879 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-15 04:15:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
50
+ 2024-08-15 04:15:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
51
+ 2024-08-15 04:15:50,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
52
+ 2024-08-15 04:15:51,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-15 04:15:56,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-15 04:16:01,412 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-15 04:16:05,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
56
+ 2024-08-15 04:16:05,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
57
+ 2024-08-15 04:16:05,246 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
58
+ 2024-08-15 04:16:06,461 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-15 04:16:08,114 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/config.yaml
60
+ 2024-08-15 04:16:12,324 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-15 04:16:17,325 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-15 04:16:20,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
63
+ 2024-08-15 04:16:20,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
64
+ 2024-08-15 04:16:20,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
65
+ 2024-08-15 04:16:22,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-15 04:16:27,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-15 04:16:32,439 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-15 04:16:35,164 DEBUG SystemMonitor:12253 [system_monitor.py:_start():172] Starting system metrics aggregation loop
69
+ 2024-08-15 04:16:35,166 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
70
+ 2024-08-15 04:16:35,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
71
+ 2024-08-15 04:16:35,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
72
+ 2024-08-15 04:16:35,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
73
+ 2024-08-15 04:16:38,433 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-15 04:16:43,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-15 04:16:48,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-15 04:16:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
77
+ 2024-08-15 04:16:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
78
+ 2024-08-15 04:16:50,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
79
+ 2024-08-15 04:16:54,406 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
80
+ 2024-08-15 04:16:56,456 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: partial_history
81
+ 2024-08-15 04:16:58,142 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
82
+ 2024-08-15 04:16:59,499 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-15 04:17:02,068 DEBUG SenderThread:12253 [sender.py:send():382] send: exit
84
+ 2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():589] handling exit code: 255
85
+ 2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():591] handling runtime: 86
86
+ 2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:send_exit():597] send defer
88
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 0
90
+ 2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
91
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 0
92
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 1
93
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
94
+ 2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 1
95
+ 2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
96
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 1
97
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 2
98
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
99
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 2
100
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [system_monitor.py:finish():203] Stopping system monitor
101
+ 2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():179] Finished system metrics aggregation loop
102
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined cpu monitor
103
+ 2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():183] Publishing last batch of metrics
104
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined disk monitor
105
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined gpu monitor
106
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined memory monitor
107
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined network monitor
108
+ 2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 2
110
+ 2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 3
111
+ 2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
112
+ 2024-08-15 04:17:02,108 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-15 04:17:02,108 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 3
114
+ 2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send():382] send: history
115
+ 2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: summary_record
116
+ 2024-08-15 04:17:02,112 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
117
+ 2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
118
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 3
119
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 4
120
+ 2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
121
+ 2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 4
122
+ 2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
123
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 4
124
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 5
125
+ 2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 5
127
+ 2024-08-15 04:17:02,114 DEBUG SenderThread:12253 [sender.py:send():382] send: summary
128
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
129
+ 2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
130
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 5
131
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 6
132
+ 2024-08-15 04:17:02,115 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
133
+ 2024-08-15 04:17:02,115 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 6
134
+ 2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
135
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 6
136
+ 2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 7
137
+ 2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
138
+ 2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
139
+ 2024-08-15 04:17:02,116 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 7
140
+ 2024-08-15 04:17:02,116 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
141
+ 2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 7
142
+ 2024-08-15 04:17:02,145 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json
143
+ 2024-08-15 04:17:03,068 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
144
+ 2024-08-15 04:17:03,854 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 8
145
+ 2024-08-15 04:17:03,854 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: poll_exit
146
+ 2024-08-15 04:17:03,854 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
147
+ 2024-08-15 04:17:03,855 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 8
148
+ 2024-08-15 04:17:03,855 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-15 04:17:03,855 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 8
150
+ 2024-08-15 04:17:03,855 INFO SenderThread:12253 [job_builder.py:build():296] Attempting to build job artifact
151
+ 2024-08-15 04:17:03,856 INFO SenderThread:12253 [job_builder.py:_get_source_type():426] is repo sourced job
152
+ 2024-08-15 04:17:03,871 INFO SenderThread:12253 [job_builder.py:build():402] adding wandb-job metadata file
153
+ 2024-08-15 04:17:03,880 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 9
154
+ 2024-08-15 04:17:03,880 DEBUG SenderThread:12253 [sender.py:send():382] send: artifact
155
+ 2024-08-15 04:17:03,880 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
156
+ 2024-08-15 04:17:03,881 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 9
157
+ 2024-08-15 04:17:04,069 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
158
+ 2024-08-15 04:17:04,146 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
159
+ 2024-08-15 04:17:35,760 WARNING StreamThr :12253 [internal.py:is_dead():414] Internal process exiting, parent pid 12182 disappeared
160
+ 2024-08-15 04:17:35,760 ERROR StreamThr :12253 [internal.py:wandb_internal():152] Internal process shutdown.
161
+ 2024-08-15 04:17:36,070 INFO WriterThread:12253 [datastore.py:close():296] close: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
162
+ 2024-08-15 04:17:36,071 INFO HandlerThread:12253 [handler.py:finish():869] shutting down handler
wandb/run-20240815_041534-1ld4rgmy/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-15 04:15:34,639 INFO MainThread:12182 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Configure stats pid to 12182
3
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug.log
9
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log
10
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-04:15:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():616] starting backend
14
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-15 04:15:34,645 INFO MainThread:12182 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-15 04:15:34,646 INFO MainThread:12182 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-15 04:15:34,651 INFO MainThread:12182 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-15 04:15:34,662 INFO MainThread:12182 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-15 04:15:35,083 INFO MainThread:12182 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-15 04:15:35,205 INFO MainThread:12182 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-15 04:15:35,207 INFO MainThread:12182 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb ADDED
Binary file (18 kB).
 
wandb/run-20240824_202022-z2bjbf6e/files/config.yaml ADDED
@@ -0,0 +1,321 @@
+ wandb_version: 1
+
+ sharding_strategy:
+ desc: null
+ value: NO_SHARD
+ checkpoint_type:
+ desc: null
+ value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+ desc: null
+ value: true
+ fsdp_cpu_offload:
+ desc: null
+ value: false
+ low_cpu_fsdp:
+ desc: null
+ value: false
+ no_meta_device:
+ desc: null
+ value: false
+ data_path:
+ desc: null
+ value: null
+ split:
+ desc: null
+ value: 969, 30, 1
+ train_data_path:
+ desc: null
+ value:
+ - '1754785366'
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+ - '28623823675'
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
+ valid_data_path:
+ desc: null
+ value:
+ - '1205770'
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
+ test_data_path:
+ desc: null
+ value:
+ - '1205770'
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
+ data_cache_path:
+ desc: null
+ value: null
+ vocab_size:
+ desc: null
+ value: null
+ vocab_file:
+ desc: null
+ value: null
+ merge_file:
+ desc: null
+ value: null
+ seq_length:
+ desc: null
+ value: 1024
+ num_workers:
+ desc: null
+ value: 4
+ tokenizer_type:
+ desc: null
+ value: HFPreTrainedTokenizer
+ tokenizer_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ reset_position_ids:
+ desc: null
+ value: false
+ reset_attention_mask:
+ desc: null
+ value: false
+ eod_mask_loss:
+ desc: null
+ value: false
+ retro_return_doc_ids:
+ desc: null
+ value: false
+ short_seq_prob:
+ desc: null
+ value: 0.1
+ vocab_extra_ids:
+ desc: null
+ value: 0
+ seed:
+ desc: null
+ value: 1234
+ use_mpi:
+ desc: null
+ value: false
+ wandb_entity:
+ desc: null
+ value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+ desc: null
+ value: yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07
+ wandb_project:
+ desc: null
+ value: yans_experiment
+ quantization:
+ desc: null
+ value: false
+ use_freeze_layers:
+ desc: null
+ value: false
+ freeze_layers:
+ desc: null
+ value: null
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ mixed_precision:
+ desc: null
+ value: true
+ param_dtype:
+ desc: null
+ value: null
+ load:
+ desc: null
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
+ save:
+ desc: null
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
+ base_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ use_better_transformer:
+ desc: null
+ value: false
+ grad_clip_norm:
+ desc: null
+ value: 1.0
+ eval_interval:
+ desc: null
+ value: 200
+ save_interval:
+ desc: null
+ value: 200
+ eval_iters:
+ desc: null
+ value: 10
+ optimizer:
+ desc: null
+ value: anyprecision
+ lr:
+ desc: null
+ value: 3.5e-06
+ lr_decay_style:
+ desc: null
+ value: cosine
+ lr_decay_iters:
+ desc: null
+ value: 23178
+ lr_warmup_iters:
+ desc: null
+ value: 500
+ min_lr:
+ desc: null
+ value: 3.5e-07
+ train_iters:
+ desc: null
+ value: 23178
+ train_samples:
+ desc: null
+ value: null
+ global_batch_size:
+ desc: null
+ value: 1280
+ micro_batch_size:
+ desc: null
+ value: 16
+ make_vocab_size_divisible_by:
+ desc: null
+ value: 128
+ sliding_window_size:
+ desc: null
+ value: 131072
+ skip_batch:
+ desc: null
+ value: null
+ no_save_optimizer_state:
+ desc: null
+ value: false
+ continual_pretraining:
+ desc: null
+ value: false
+ instruction_tuning:
+ desc: null
+ value: false
+ direct_preference_optimization:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.1
+ hidden_dropout:
+ desc: null
+ value: 0.1
+ weight_decay:
+ desc: null
+ value: 0.1
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.95
+ adam_eps:
+ desc: null
+ value: 1.0e-08
+ hf_transformer_model_dir:
+ desc: null
+ value: null
+ instruction_train_data_path:
+ desc: null
+ value: null
+ instruction_valid_data_path:
+ desc: null
+ value: null
+ epoch:
+ desc: null
+ value: null
+ instruction_dataset_size:
+ desc: null
+ value: null
+ save_sampler_state:
+ desc: null
+ value: false
+ label_smoothing:
+ desc: null
+ value: 0.0
+ save_n_checkpoints:
+ desc: null
+ value: 10
+ hf_repo_id:
+ desc: null
+ value: koichi12/yans-baseline-qwen2-0.5B
+ create_public_hf_repo:
+ desc: null
+ value: false
+ upload_all_checkpoints_to_hf:
+ desc: null
+ value: true
+ hf_upload_retry_limit:
+ desc: null
+ value: 2
+ exit_duration_in_mins:
+ desc: null
+ value: null
+ source_key:
+ desc: null
+ value: null
+ target_key:
+ desc: null
+ value: null
+ attn_implementation:
+ desc: null
+ value: flash_attention_2
+ efficient_instruction_tuning:
+ desc: null
+ value: false
+ remove_padding_masking:
+ desc: null
+ value: false
+ save_start_iter:
+ desc: null
+ value: null
+ valid_micro_batch_size:
+ desc: null
+ value: 1
+ rank:
+ desc: null
+ value: 0
+ world_size:
+ desc: null
+ value: 8
+ padded_vocab_size:
+ desc: null
+ value: 151680
+ gradient_accumulation_steps:
+ desc: null
+ value: 10
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.43.3
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1724498422.652614
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ - 105
+ 2:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ - 105
+ 3:
+ - 13
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 6: 4.43.3
+ 8:
+ - 5
+ 13: linux-x86_64
wandb/run-20240824_202022-z2bjbf6e/files/output.log ADDED
@@ -0,0 +1,51 @@
+ Created Hugging Face repository with ID koichi12/yans-baseline-qwen2-0.5B.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/yans-baseline-qwen2-0.5B, skipping model loading
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ Let split = None
+ Unable to save the indexes because path_to_cache is None
+ Traceback (most recent call last):
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 270, in build_generic_dataset
+ dataset = cls(*args)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 359, in __init__
+ self.initialize(path_prefix, multimodal)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 374, in initialize
+ self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 233, in __init__
+ with open(idx_path, "rb") as stream:
+ FileNotFoundError: [Errno 2] No such file or directory: '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document.idx'
+ The above exception was the direct cause of the following exception:
+ Traceback (most recent call last):
+ File "/project/examples/finetuning.py", line 13, in <module>
+ main()
+ File "/project/src/llama_recipes/finetuning.py", line 162, in main
+ train_dataset, validation_dataset, test_dataset = build_train_valid_test_datasets()
+ File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 76, in build_train_valid_test_datasets
+ return train_valid_test_datasets_provider(train_val_test_num_samples)
+ File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 46, in train_valid_test_datasets_provider
+ ).build()
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 56, in build
+ return self._build_blended_dataset_splits()
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 162, in _build_blended_dataset_splits
+ self._build_megatron_dataset_splits(
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 199, in _build_megatron_dataset_splits
+ indexed_dataset = self.build_generic_dataset(
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 278, in build_generic_dataset
+ raise Exception(log) from err
+ Exception: Failed to write dataset materials to the data cache directory. Please supply a directory to which you have write access via the path_to_cache attribute in BlendedMegatronDatasetConfig and retry. Refer to the preserved traceback above for more information.
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+ train: 29667840
+ validation: 1484800
+ test: 12800
+ > building train, validation, and test datasets for GPT ...
wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json ADDED
@@ -0,0 +1,880 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-24T11:20:23.248321",
5
+ "startedAt": "2024-08-24T11:20:22.637930",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1024",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "16",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "1280",
19
+ "--train-iters",
20
+ "23178",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1205770",
32
+ "/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
33
+ "--test-data-path",
34
+ "1205770",
35
+ "/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
36
+ "--lr",
37
+ "3.5e-6",
38
+ "--min-lr",
39
+ "3.5e-7",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "23178",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-8",
58
+ "--save-interval",
59
+ "200",
60
+ "--eval-interval",
61
+ "200",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
70
+ "--load",
71
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
72
+ "--num-workers",
73
+ "4",
74
+ "--fsdp-activation-checkpointing",
75
+ "--sharding-strategy",
76
+ "NO_SHARD",
77
+ "--checkpoint-type",
78
+ "LOCAL_STATE_DICT",
79
+ "--save-n-checkpoints",
80
+ "10",
81
+ "--upload-all-checkpoints-to-hf",
82
+ "--hf-upload-retry-limit",
83
+ "2",
84
+ "--hf-repo-id",
85
+ "koichi12/yans-baseline-qwen2-0.5B",
86
+ "--wandb-entity",
87
+ "iwakawa-koichi-q5-tohoku-nlp6723",
88
+ "--wandb-project",
89
+ "yans_experiment",
90
+ "--wandb-name",
91
+ "yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07"
92
+ ],
93
+ "state": "running",
94
+ "program": "/project/examples/finetuning.py",
95
+ "codePathLocal": "examples/finetuning.py",
96
+ "codePath": "examples/finetuning.py",
97
+ "git": {
98
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
99
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
100
+ },
101
+ "email": null,
102
+ "root": "/project",
103
+ "host": "gpu-koiwa-00",
104
+ "username": "koiwa",
105
+ "executable": "/usr/bin/python",
106
+ "cpu_count": 144,
107
+ "cpu_count_logical": 144,
108
+ "cpu_freq": {
109
+ "current": 2400.0340000000015,
110
+ "min": 0.0,
111
+ "max": 0.0
112
+ },
113
+ "cpu_freq_per_core": [
114
+ {
115
+ "current": 2400.034,
116
+ "min": 0.0,
117
+ "max": 0.0
118
+ },
119
+ {
120
+ "current": 2400.034,
121
+ "min": 0.0,
122
+ "max": 0.0
123
+ },
124
+ {
125
+ "current": 2400.034,
126
+ "min": 0.0,
127
+ "max": 0.0
128
+ },
129
+ {
130
+ "current": 2400.034,
131
+ "min": 0.0,
132
+ "max": 0.0
133
+ },
134
+ {
135
+ "current": 2400.034,
136
+ "min": 0.0,
137
+ "max": 0.0
138
+ },
139
+ {
140
+ "current": 2400.034,
141
+ "min": 0.0,
142
+ "max": 0.0
143
+ },
144
+ {
145
+ "current": 2400.034,
146
+ "min": 0.0,
147
+ "max": 0.0
148
+ },
149
+ {
150
+ "current": 2400.034,
151
+ "min": 0.0,
152
+ "max": 0.0
153
+ },
154
+ {
155
+ "current": 2400.034,
156
+ "min": 0.0,
157
+ "max": 0.0
158
+ },
159
+ {
160
+ "current": 2400.034,
161
+ "min": 0.0,
162
+ "max": 0.0
163
+ },
164
+ {
165
+ "current": 2400.034,
166
+ "min": 0.0,
167
+ "max": 0.0
168
+ },
169
+ {
170
+ "current": 2400.034,
171
+ "min": 0.0,
172
+ "max": 0.0
173
+ },
174
+ {
175
+ "current": 2400.034,
176
+ "min": 0.0,
177
+ "max": 0.0
178
+ },
179
+ {
180
+ "current": 2400.034,
181
+ "min": 0.0,
182
+ "max": 0.0
183
+ },
184
+ {
185
+ "current": 2400.034,
186
+ "min": 0.0,
187
+ "max": 0.0
188
+ },
189
+ {
190
+ "current": 2400.034,
191
+ "min": 0.0,
192
+ "max": 0.0
193
+ },
194
+ {
195
+ "current": 2400.034,
196
+ "min": 0.0,
197
+ "max": 0.0
198
+ },
199
+ {
200
+ "current": 2400.034,
201
+ "min": 0.0,
202
+ "max": 0.0
203
+ },
204
+ {
205
+ "current": 2400.034,
206
+ "min": 0.0,
207
+ "max": 0.0
208
+ },
209
+ {
210
+ "current": 2400.034,
211
+ "min": 0.0,
212
+ "max": 0.0
213
+ },
214
+ {
215
+ "current": 2400.034,
216
+ "min": 0.0,
217
+ "max": 0.0
218
+ },
219
+ {
220
+ "current": 2400.034,
221
+ "min": 0.0,
222
+ "max": 0.0
223
+ },
224
+ {
225
+ "current": 2400.034,
226
+ "min": 0.0,
227
+ "max": 0.0
228
+ },
229
+ {
230
+ "current": 2400.034,
231
+ "min": 0.0,
232
+ "max": 0.0
233
+ },
234
+ {
235
+ "current": 2400.034,
236
+ "min": 0.0,
237
+ "max": 0.0
238
+ },
239
+ {
240
+ "current": 2400.034,
241
+ "min": 0.0,
242
+ "max": 0.0
243
+ },
244
+ {
245
+ "current": 2400.034,
246
+ "min": 0.0,
247
+ "max": 0.0
248
+ },
249
+ {
250
+ "current": 2400.034,
251
+ "min": 0.0,
252
+ "max": 0.0
253
+ },
254
+ {
255
+ "current": 2400.034,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ {
260
+ "current": 2400.034,
261
+ "min": 0.0,
262
+ "max": 0.0
263
+ },
264
+ {
265
+ "current": 2400.034,
266
+ "min": 0.0,
267
+ "max": 0.0
268
+ },
269
+ {
270
+ "current": 2400.034,
271
+ "min": 0.0,
272
+ "max": 0.0
273
+ },
274
+ {
275
+ "current": 2400.034,
276
+ "min": 0.0,
277
+ "max": 0.0
278
+ },
279
+ {
280
+ "current": 2400.034,
281
+ "min": 0.0,
282
+ "max": 0.0
283
+ },
284
+ {
285
+ "current": 2400.034,
286
+ "min": 0.0,
287
+ "max": 0.0
288
+ },
289
+ {
290
+ "current": 2400.034,
291
+ "min": 0.0,
292
+ "max": 0.0
293
+ },
294
+ {
295
+ "current": 2400.034,
296
+ "min": 0.0,
297
+ "max": 0.0
298
+ },
299
+ {
300
+ "current": 2400.034,
301
+ "min": 0.0,
302
+ "max": 0.0
303
+ },
304
+ {
305
+ "current": 2400.034,
306
+ "min": 0.0,
307
+ "max": 0.0
308
+ },
309
+ {
310
+ "current": 2400.034,
311
+ "min": 0.0,
312
+ "max": 0.0
313
+ },
314
+ {
315
+ "current": 2400.034,
316
+ "min": 0.0,
317
+ "max": 0.0
318
+ },
319
+ {
320
+ "current": 2400.034,
321
+ "min": 0.0,
322
+ "max": 0.0
323
+ },
324
+ {
325
+ "current": 2400.034,
326
+ "min": 0.0,
327
+ "max": 0.0
328
+ },
329
+ {
330
+ "current": 2400.034,
331
+ "min": 0.0,
332
+ "max": 0.0
333
+ },
334
+ {
335
+ "current": 2400.034,
336
+ "min": 0.0,
337
+ "max": 0.0
338
+ },
339
+ {
340
+ "current": 2400.034,
341
+ "min": 0.0,
342
+ "max": 0.0
343
+ },
344
+ {
345
+ "current": 2400.034,
346
+ "min": 0.0,
347
+ "max": 0.0
348
+ },
349
+ {
350
+ "current": 2400.034,
351
+ "min": 0.0,
352
+ "max": 0.0
353
+ },
354
+ {
355
+ "current": 2400.034,
356
+ "min": 0.0,
357
+ "max": 0.0
358
+ },
359
+ {
360
+ "current": 2400.034,
361
+ "min": 0.0,
362
+ "max": 0.0
363
+ },
364
+ {
365
+ "current": 2400.034,
366
+ "min": 0.0,
367
+ "max": 0.0
368
+ },
369
+ {
370
+ "current": 2400.034,
371
+ "min": 0.0,
372
+ "max": 0.0
373
+ },
374
+ {
375
+ "current": 2400.034,
376
+ "min": 0.0,
377
+ "max": 0.0
378
+ },
379
+ {
380
+ "current": 2400.034,
381
+ "min": 0.0,
382
+ "max": 0.0
383
+ },
384
+ {
385
+ "current": 2400.034,
386
+ "min": 0.0,
387
+ "max": 0.0
388
+ },
389
+ {
390
+ "current": 2400.034,
391
+ "min": 0.0,
392
+ "max": 0.0
393
+ },
394
+ {
395
+ "current": 2400.034,
396
+ "min": 0.0,
397
+ "max": 0.0
398
+ },
399
+ {
400
+ "current": 2400.034,
401
+ "min": 0.0,
402
+ "max": 0.0
403
+ },
404
+ {
405
+ "current": 2400.034,
406
+ "min": 0.0,
407
+ "max": 0.0
408
+ },
409
+ {
410
+ "current": 2400.034,
411
+ "min": 0.0,
412
+ "max": 0.0
413
+ },
414
+ {
415
+ "current": 2400.034,
416
+ "min": 0.0,
417
+ "max": 0.0
418
+ },
419
+ {
420
+ "current": 2400.034,
421
+ "min": 0.0,
422
+ "max": 0.0
423
+ },
424
+ {
425
+ "current": 2400.034,
426
+ "min": 0.0,
427
+ "max": 0.0
428
+ },
429
+ {
430
+ "current": 2400.034,
431
+ "min": 0.0,
432
+ "max": 0.0
433
+ },
434
+ {
435
+ "current": 2400.034,
436
+ "min": 0.0,
437
+ "max": 0.0
438
+ },
439
+ {
440
+ "current": 2400.034,
441
+ "min": 0.0,
442
+ "max": 0.0
443
+ },
444
+ {
445
+ "current": 2400.034,
446
+ "min": 0.0,
447
+ "max": 0.0
448
+ },
449
+ {
450
+ "current": 2400.034,
451
+ "min": 0.0,
452
+ "max": 0.0
453
+ },
454
+ {
455
+ "current": 2400.034,
456
+ "min": 0.0,
457
+ "max": 0.0
458
+ },
459
+ {
460
+ "current": 2400.034,
461
+ "min": 0.0,
462
+ "max": 0.0
463
+ },
464
+ {
465
+ "current": 2400.034,
466
+ "min": 0.0,
467
+ "max": 0.0
468
+ },
469
+ {
470
+ "current": 2400.034,
471
+ "min": 0.0,
472
+ "max": 0.0
473
+ },
474
+ {
475
+ "current": 2400.034,
476
+ "min": 0.0,
477
+ "max": 0.0
478
+ },
479
+ {
480
+ "current": 2400.034,
481
+ "min": 0.0,
482
+ "max": 0.0
483
+ },
484
+ {
485
+ "current": 2400.034,
486
+ "min": 0.0,
487
+ "max": 0.0
488
+ },
489
+ {
490
+ "current": 2400.034,
491
+ "min": 0.0,
492
+ "max": 0.0
493
+ },
494
+ {
495
+ "current": 2400.034,
496
+ "min": 0.0,
497
+ "max": 0.0
498
+ },
499
+ {
500
+ "current": 2400.034,
501
+ "min": 0.0,
502
+ "max": 0.0
503
+ },
504
+ {
505
+ "current": 2400.034,
506
+ "min": 0.0,
507
+ "max": 0.0
508
+ },
509
+ {
510
+ "current": 2400.034,
511
+ "min": 0.0,
512
+ "max": 0.0
513
+ },
514
+ {
515
+ "current": 2400.034,
516
+ "min": 0.0,
517
+ "max": 0.0
518
+ },
519
+ {
520
+ "current": 2400.034,
521
+ "min": 0.0,
522
+ "max": 0.0
523
+ },
524
+ {
525
+ "current": 2400.034,
526
+ "min": 0.0,
527
+ "max": 0.0
528
+ },
529
+ {
530
+ "current": 2400.034,
531
+ "min": 0.0,
532
+ "max": 0.0
533
+ },
534
+ {
535
+ "current": 2400.034,
536
+ "min": 0.0,
537
+ "max": 0.0
538
+ },
539
+ {
540
+ "current": 2400.034,
541
+ "min": 0.0,
542
+ "max": 0.0
543
+ },
544
+ {
545
+ "current": 2400.034,
546
+ "min": 0.0,
547
+ "max": 0.0
548
+ },
549
+ {
550
+ "current": 2400.034,
551
+ "min": 0.0,
552
+ "max": 0.0
553
+ },
554
+ {
555
+ "current": 2400.034,
556
+ "min": 0.0,
557
+ "max": 0.0
558
+ },
559
+ {
560
+ "current": 2400.034,
561
+ "min": 0.0,
562
+ "max": 0.0
563
+ },
564
+ {
565
+ "current": 2400.034,
566
+ "min": 0.0,
567
+ "max": 0.0
568
+ },
569
+ {
570
+ "current": 2400.034,
571
+ "min": 0.0,
572
+ "max": 0.0
573
+ },
574
+ {
575
+ "current": 2400.034,
576
+ "min": 0.0,
577
+ "max": 0.0
578
+ },
579
+ {
580
+ "current": 2400.034,
581
+ "min": 0.0,
582
+ "max": 0.0
583
+ },
584
+ {
585
+ "current": 2400.034,
586
+ "min": 0.0,
587
+ "max": 0.0
588
+ },
589
+ {
590
+ "current": 2400.034,
591
+ "min": 0.0,
592
+ "max": 0.0
593
+ },
594
+ {
595
+ "current": 2400.034,
596
+ "min": 0.0,
597
+ "max": 0.0
598
+ },
599
+ {
600
+ "current": 2400.034,
601
+ "min": 0.0,
602
+ "max": 0.0
603
+ },
604
+ {
605
+ "current": 2400.034,
606
+ "min": 0.0,
607
+ "max": 0.0
608
+ },
609
+ {
610
+ "current": 2400.034,
611
+ "min": 0.0,
612
+ "max": 0.0
613
+ },
614
+ {
615
+ "current": 2400.034,
616
+ "min": 0.0,
617
+ "max": 0.0
618
+ },
619
+ {
620
+ "current": 2400.034,
621
+ "min": 0.0,
622
+ "max": 0.0
623
+ },
624
+ {
625
+ "current": 2400.034,
626
+ "min": 0.0,
627
+ "max": 0.0
628
+ },
629
+ {
630
+ "current": 2400.034,
631
+ "min": 0.0,
632
+ "max": 0.0
633
+ },
634
+ {
635
+ "current": 2400.034,
636
+ "min": 0.0,
637
+ "max": 0.0
638
+ },
639
+ {
640
+ "current": 2400.034,
641
+ "min": 0.0,
642
+ "max": 0.0
643
+ },
644
+ {
645
+ "current": 2400.034,
646
+ "min": 0.0,
647
+ "max": 0.0
648
+ },
649
+ {
650
+ "current": 2400.034,
651
+ "min": 0.0,
652
+ "max": 0.0
653
+ },
654
+ {
655
+ "current": 2400.034,
656
+ "min": 0.0,
657
+ "max": 0.0
658
+ },
659
+ {
660
+ "current": 2400.034,
661
+ "min": 0.0,
662
+ "max": 0.0
663
+ },
664
+ {
665
+ "current": 2400.034,
666
+ "min": 0.0,
667
+ "max": 0.0
668
+ },
669
+ {
670
+ "current": 2400.034,
671
+ "min": 0.0,
672
+ "max": 0.0
673
+ },
674
+ {
675
+ "current": 2400.034,
676
+ "min": 0.0,
677
+ "max": 0.0
678
+ },
679
+ {
680
+ "current": 2400.034,
681
+ "min": 0.0,
682
+ "max": 0.0
683
+ },
684
+ {
685
+ "current": 2400.034,
686
+ "min": 0.0,
687
+ "max": 0.0
688
+ },
689
+ {
690
+ "current": 2400.034,
691
+ "min": 0.0,
692
+ "max": 0.0
693
+ },
694
+ {
695
+ "current": 2400.034,
696
+ "min": 0.0,
697
+ "max": 0.0
698
+ },
699
+ {
700
+ "current": 2400.034,
701
+ "min": 0.0,
702
+ "max": 0.0
703
+ },
704
+ {
705
+ "current": 2400.034,
706
+ "min": 0.0,
707
+ "max": 0.0
708
+ },
709
+ {
710
+ "current": 2400.034,
711
+ "min": 0.0,
712
+ "max": 0.0
713
+ },
714
+ {
715
+ "current": 2400.034,
716
+ "min": 0.0,
717
+ "max": 0.0
718
+ },
719
+ {
720
+ "current": 2400.034,
721
+ "min": 0.0,
722
+ "max": 0.0
723
+ },
724
+ {
725
+ "current": 2400.034,
726
+ "min": 0.0,
727
+ "max": 0.0
728
+ },
729
+ {
730
+ "current": 2400.034,
731
+ "min": 0.0,
732
+ "max": 0.0
733
+ },
734
+ {
735
+ "current": 2400.034,
736
+ "min": 0.0,
737
+ "max": 0.0
738
+ },
739
+ {
740
+ "current": 2400.034,
741
+ "min": 0.0,
742
+ "max": 0.0
743
+ },
744
+ {
745
+ "current": 2400.034,
746
+ "min": 0.0,
747
+ "max": 0.0
748
+ },
749
+ {
750
+ "current": 2400.034,
751
+ "min": 0.0,
752
+ "max": 0.0
753
+ },
754
+ {
755
+ "current": 2400.034,
756
+ "min": 0.0,
757
+ "max": 0.0
758
+ },
759
+ {
760
+ "current": 2400.034,
761
+ "min": 0.0,
762
+ "max": 0.0
763
+ },
764
+ {
765
+ "current": 2400.034,
766
+ "min": 0.0,
767
+ "max": 0.0
768
+ },
769
+ {
770
+ "current": 2400.034,
771
+ "min": 0.0,
772
+ "max": 0.0
773
+ },
774
+ {
775
+ "current": 2400.034,
776
+ "min": 0.0,
777
+ "max": 0.0
778
+ },
779
+ {
780
+ "current": 2400.034,
781
+ "min": 0.0,
782
+ "max": 0.0
783
+ },
784
+ {
785
+ "current": 2400.034,
786
+ "min": 0.0,
787
+ "max": 0.0
788
+ },
789
+ {
790
+ "current": 2400.034,
791
+ "min": 0.0,
792
+ "max": 0.0
793
+ },
794
+ {
795
+ "current": 2400.034,
796
+ "min": 0.0,
797
+ "max": 0.0
798
+ },
799
+ {
800
+ "current": 2400.034,
801
+ "min": 0.0,
802
+ "max": 0.0
803
+ },
804
+ {
805
+ "current": 2400.034,
806
+ "min": 0.0,
807
+ "max": 0.0
808
+ },
809
+ {
810
+ "current": 2400.034,
811
+ "min": 0.0,
812
+ "max": 0.0
813
+ },
814
+ {
815
+ "current": 2400.034,
816
+ "min": 0.0,
817
+ "max": 0.0
818
+ },
819
+ {
820
+ "current": 2400.034,
821
+ "min": 0.0,
822
+ "max": 0.0
823
+ },
824
+ {
825
+ "current": 2400.034,
826
+ "min": 0.0,
827
+ "max": 0.0
828
+ },
829
+ {
830
+ "current": 2400.034,
831
+ "min": 0.0,
832
+ "max": 0.0
833
+ }
834
+ ],
835
+ "disk": {
836
+ "/": {
837
+ "total": 0.0625,
838
+ "used": 1.1444091796875e-05
839
+ }
840
+ },
841
+ "gpu": "NVIDIA A100-SXM4-40GB",
842
+ "gpu_count": 8,
843
+ "gpu_devices": [
844
+ {
845
+ "name": "NVIDIA A100-SXM4-40GB",
846
+ "memory_total": 42949672960
847
+ },
848
+ {
849
+ "name": "NVIDIA A100-SXM4-40GB",
850
+ "memory_total": 42949672960
851
+ },
852
+ {
853
+ "name": "NVIDIA A100-SXM4-40GB",
854
+ "memory_total": 42949672960
855
+ },
856
+ {
857
+ "name": "NVIDIA A100-SXM4-40GB",
858
+ "memory_total": 42949672960
859
+ },
860
+ {
861
+ "name": "NVIDIA A100-SXM4-40GB",
862
+ "memory_total": 42949672960
863
+ },
864
+ {
865
+ "name": "NVIDIA A100-SXM4-40GB",
866
+ "memory_total": 42949672960
867
+ },
868
+ {
869
+ "name": "NVIDIA A100-SXM4-40GB",
870
+ "memory_total": 42949672960
871
+ },
872
+ {
873
+ "name": "NVIDIA A100-SXM4-40GB",
874
+ "memory_total": 42949672960
875
+ }
876
+ ],
877
+ "memory": {
878
+ "total": 453.4449462890625
879
+ }
880
+ }
wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 4}}
wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log ADDED
@@ -0,0 +1,191 @@
1
+ 2024-08-24 20:20:22,655 INFO StreamThr :25836 [internal.py:wandb_internal():86] W&B internal server running at pid: 25836, started at: 2024-08-24 20:20:22.654049
2
+ 2024-08-24 20:20:22,656 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-24 20:20:22,659 INFO WriterThread:25836 [datastore.py:open_for_write():87] open: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
4
+ 2024-08-24 20:20:22,660 DEBUG SenderThread:25836 [sender.py:send():382] send: header
5
+ 2024-08-24 20:20:22,676 DEBUG SenderThread:25836 [sender.py:send():382] send: run
6
+ 2024-08-24 20:20:23,101 INFO SenderThread:25836 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240824_202022-z2bjbf6e/files
7
+ 2024-08-24 20:20:23,101 INFO SenderThread:25836 [sender.py:_start_run_threads():1136] run started: z2bjbf6e with start time 1724498422.652614
8
+ 2024-08-24 20:20:23,106 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-24 20:20:23,106 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-24 20:20:23,175 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():27] System info init
12
+ 2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():42] System info init done
13
+ 2024-08-24 20:20:23,182 INFO HandlerThread:25836 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-24 20:20:23,182 INFO SystemMonitor:25836 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-24 20:20:23,183 INFO HandlerThread:25836 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-24 20:20:23,184 INFO SystemMonitor:25836 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-24 20:20:23,185 INFO SystemMonitor:25836 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-24 20:20:23,186 INFO SystemMonitor:25836 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-24 20:20:23,248 DEBUG HandlerThread:25836 [system_info.py:probe():151] Probing system
22
+ 2024-08-24 20:20:23,250 DEBUG HandlerThread:25836 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:probe():199] Probing system done
25
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-24T11:20:23.248321', 'startedAt': '2024-08-24T11:20:22.637930', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '131072', '--micro-batch-size', '16', '--valid_micro_batch_size', '1', '--global-batch-size', '1280', '--train-iters', '23178', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--test-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--lr', '3.5e-6', '--min-lr', '3.5e-7', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '23178', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--num-workers', '4', '--fsdp-activation-checkpointing', '--sharding-strategy', 'NO_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-baseline-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'yans_experiment', '--wandb-name', 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 144, 'cpu_count_logical': 144, 'cpu_freq': {'current': 2400.0340000000015, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 
{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 
0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 453.4449462890625}}
26
+ 2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-24 20:20:23,266 INFO HandlerThread:25836 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-24 20:20:23,272 DEBUG SenderThread:25836 [sender.py:send():382] send: files
30
+ 2024-08-24 20:20:23,272 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-24 20:20:23,283 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-24 20:20:23,284 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-24 20:20:23,286 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-24 20:20:23,526 DEBUG SenderThread:25836 [sender.py:send():382] send: telemetry
37
+ 2024-08-24 20:20:23,973 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /tmp/tmpwjpjqs3pwandb/55szr5f9-wandb-metadata.json
38
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
39
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json
40
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
41
+ 2024-08-24 20:20:26,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
42
+ 2024-08-24 20:20:27,701 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-24 20:20:27,737 DEBUG SenderThread:25836 [sender.py:send():382] send: exit
44
+ 2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():591] handling runtime: 4
46
+ 2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:send_exit():597] send defer
48
+ 2024-08-24 20:20:27,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-24 20:20:27,739 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-24 20:20:27,740 DEBUG SystemMonitor:25836 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-24 20:20:28,105 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
67
+ 2024-08-24 20:20:28,106 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
68
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined gpu monitor
69
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined memory monitor
70
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined network monitor
71
+ 2024-08-24 20:20:28,918 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
72
+ 2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
73
+ 2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 2
74
+ 2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 3
75
+ 2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send():382] send: stats
76
+ 2024-08-24 20:20:28,920 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
77
+ 2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
78
+ 2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-24 20:20:28,921 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-24 20:20:28,922 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-24 20:20:28,922 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send():382] send: summary
90
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-24 20:20:28,923 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-24 20:20:28,923 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-24 20:20:28,924 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-24 20:20:28,924 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-24 20:20:28,924 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-24 20:20:28,927 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-24 20:20:29,107 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
100
+ 2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 7
101
+ 2024-08-24 20:20:29,126 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
102
+ 2024-08-24 20:20:29,126 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 7
103
+ 2024-08-24 20:20:29,126 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
104
+ 2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 7
105
+ 2024-08-24 20:20:29,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
106
+ 2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
107
+ 2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
108
+ 2024-08-24 20:20:31,391 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 8
109
+ 2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
110
+ 2024-08-24 20:20:31,392 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-08-24 20:20:31,392 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 8
112
+ 2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
113
+ 2024-08-24 20:20:31,392 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 8
114
+ 2024-08-24 20:20:31,392 INFO SenderThread:25836 [job_builder.py:build():296] Attempting to build job artifact
115
+ 2024-08-24 20:20:31,393 INFO SenderThread:25836 [job_builder.py:_get_source_type():426] is repo sourced job
116
+ 2024-08-24 20:20:31,408 INFO SenderThread:25836 [job_builder.py:build():402] adding wandb-job metadata file
117
+ 2024-08-24 20:20:31,417 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 9
118
+ 2024-08-24 20:20:31,418 DEBUG SenderThread:25836 [sender.py:send():382] send: artifact
119
+ 2024-08-24 20:20:31,418 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-24 20:20:31,419 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 9
121
+ 2024-08-24 20:20:31,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
122
+ 2024-08-24 20:20:32,109 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
123
+ 2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjgwMzg3NA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'versionIndex': 0}}}
124
+ 2024-08-24 20:20:34,782 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
125
+ 2024-08-24 20:20:34,782 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 9
127
+ 2024-08-24 20:20:34,783 INFO SenderThread:25836 [dir_watcher.py:finish():358] shutting down directory watcher
128
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240824_202022-z2bjbf6e/files
129
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt requirements.txt
130
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml config.yaml
131
+ 2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json wandb-metadata.json
132
+ 2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json wandb-summary.json
133
+ 2024-08-24 20:20:35,113 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log output.log
134
+ 2024-08-24 20:20:35,115 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 10
135
+ 2024-08-24 20:20:35,115 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
136
+ 2024-08-24 20:20:35,116 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
137
+ 2024-08-24 20:20:35,117 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 10
138
+ 2024-08-24 20:20:35,117 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
139
+ 2024-08-24 20:20:35,117 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 10
140
+ 2024-08-24 20:20:35,117 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
141
+ 2024-08-24 20:20:35,574 INFO wandb-upload_1:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
142
+ 2024-08-24 20:20:35,574 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
143
+ 2024-08-24 20:20:35,580 INFO wandb-upload_2:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
144
+ 2024-08-24 20:20:35,588 INFO wandb-upload_3:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
145
+ 2024-08-24 20:20:35,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
146
+ 2024-08-24 20:20:35,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
147
+ 2024-08-24 20:20:35,788 INFO Thread-11 (_thread_body):25836 [sender.py:transition_state():617] send defer: 11
148
+ 2024-08-24 20:20:35,788 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
149
+ 2024-08-24 20:20:35,788 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 11
150
+ 2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
151
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 11
152
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
153
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 12
154
+ 2024-08-24 20:20:35,789 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-08-24 20:20:35,789 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 12
156
+ 2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
157
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 12
158
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_stream.py:finish():595] file stream finish called
159
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [file_stream.py:finish():599] file stream finish is done
160
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 13
161
+ 2024-08-24 20:20:36,056 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-24 20:20:36,056 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 13
163
+ 2024-08-24 20:20:36,056 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 13
165
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 14
166
+ 2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
167
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: final
168
+ 2024-08-24 20:20:36,057 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 14
169
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: footer
170
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
171
+ 2024-08-24 20:20:36,057 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 14
172
+ 2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
173
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
174
+ 2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: server_info
176
+ 2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
177
+ 2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: server_info
178
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: get_summary
179
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: sampled_history
180
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: job_info
182
+ 2024-08-24 20:20:36,224 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: job_info
183
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3866] rendering history
184
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
185
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_sync_info():3825] logging synced files
186
+ 2024-08-24 20:20:36,225 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: shutdown
187
+ 2024-08-24 20:20:36,225 INFO HandlerThread:25836 [handler.py:finish():869] shutting down handler
188
+ 2024-08-24 20:20:37,061 INFO WriterThread:25836 [datastore.py:close():296] close: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
189
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [sender.py:finish():1572] shutting down sender
190
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
191
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240824_202022-z2bjbf6e/logs/debug.log ADDED
@@ -0,0 +1,28 @@
1
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Configure stats pid to 25210
3
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train baseline'}
6
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug.log
9
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log
10
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'test_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 4, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07', 'wandb_project': 'yans_experiment', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 3.5e-06, 'lr_decay_style': 'cosine', 'lr_decay_iters': 23178, 'lr_warmup_iters': 500, 'min_lr': 3.5e-07, 'train_iters': 23178, 'train_samples': None, 'global_batch_size': 1280, 'micro_batch_size': 16, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-baseline-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 8, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 10}
13
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():616] starting backend
14
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-24 20:20:22,651 INFO MainThread:25210 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-24 20:20:22,652 INFO MainThread:25210 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-24 20:20:22,659 INFO MainThread:25210 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-24 20:20:22,672 INFO MainThread:25210 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-24 20:20:23,105 INFO MainThread:25210 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-24 20:20:23,284 INFO MainThread:25210 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-24 20:20:37,225 WARNING MsgRouterThr:25210 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb ADDED
Binary file (18.8 kB)
wandb/run-20240826_221726-7jzdp89j/files/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1205770'
38
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1205770'
43
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 1024
59
+ num_workers:
60
+ desc: null
61
+ value: 4
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-1.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: yans-baseline-qwen2-1.5B-3.5e-5_train_2024-08-26-22:17:00
98
+ wandb_project:
99
+ desc: null
100
+ value: yans_experiment
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-1.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 200
140
+ save_interval:
141
+ desc: null
142
+ value: 200
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 3.5e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 23178
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 3.5e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 23178
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 1280
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 16
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-08
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/yans-baseline-qwen2-1.5B-3.5e-5
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 8
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 10
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724678246.995911
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 1536
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 1024
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 12
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 28
wandb/run-20240826_221726-7jzdp89j/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240826_221726-7jzdp89j/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0