koichi12 commited on
Commit
1cf1fd6
·
verified ·
1 Parent(s): ca3e41a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. wandb/run-20240803_191521-sg37tylz/files/config.yaml +335 -0
  2. wandb/run-20240803_191521-sg37tylz/files/output.log +103 -0
  3. wandb/run-20240803_191521-sg37tylz/files/requirements.txt +271 -0
  4. wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json +215 -0
  5. wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json +1 -0
  6. wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log +194 -0
  7. wandb/run-20240803_191521-sg37tylz/logs/debug.log +30 -0
  8. wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb +0 -0
  9. wandb/run-20240803_191815-jdwps0z3/files/config.yaml +335 -0
  10. wandb/run-20240803_191815-jdwps0z3/files/output.log +239 -0
  11. wandb/run-20240803_191815-jdwps0z3/files/requirements.txt +271 -0
  12. wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json +215 -0
  13. wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json +1 -0
  14. wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log +524 -0
  15. wandb/run-20240803_191815-jdwps0z3/logs/debug.log +29 -0
  16. wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb +0 -0
  17. wandb/run-20240803_192355-n3hnzq4n/files/config.yaml +335 -0
  18. wandb/run-20240803_192355-n3hnzq4n/files/output.log +0 -0
  19. wandb/run-20240803_192355-n3hnzq4n/files/requirements.txt +271 -0
  20. wandb/run-20240803_192355-n3hnzq4n/files/wandb-metadata.json +215 -0
  21. wandb/run-20240803_192355-n3hnzq4n/files/wandb-summary.json +1 -0
  22. wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log +0 -0
  23. wandb/run-20240803_192355-n3hnzq4n/logs/debug.log +29 -0
  24. wandb/run-20240812_063027-j1htzx7q/files/config.yaml +335 -0
  25. wandb/run-20240812_063027-j1htzx7q/files/requirements.txt +271 -0
  26. wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json +215 -0
  27. wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log +261 -0
  28. wandb/run-20240812_063027-j1htzx7q/logs/debug.log +30 -0
  29. wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb +0 -0
  30. wandb/run-20240823_163849-faey1t8u/files/config.yaml +342 -0
  31. wandb/run-20240823_163849-faey1t8u/files/output.log +126 -0
  32. wandb/run-20240823_163849-faey1t8u/files/requirements.txt +375 -0
  33. wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json +220 -0
  34. wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json +1 -0
  35. wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log +439 -0
  36. wandb/run-20240823_163849-faey1t8u/logs/debug.log +30 -0
  37. wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb +0 -0
  38. wandb/run-20240823_202540-om09pls8/files/config.yaml +342 -0
  39. wandb/run-20240823_202540-om09pls8/files/output.log +133 -0
  40. wandb/run-20240823_202540-om09pls8/files/requirements.txt +375 -0
  41. wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json +502 -0
  42. wandb/run-20240823_202540-om09pls8/files/wandb-summary.json +1 -0
  43. wandb/run-20240823_202540-om09pls8/logs/debug-internal.log +312 -0
  44. wandb/run-20240823_202540-om09pls8/logs/debug.log +30 -0
  45. wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb +0 -0
  46. wandb/run-20240831_192346-5vo4p2k7/files/config.yaml +313 -0
  47. wandb/run-20240831_192346-5vo4p2k7/files/output.log +15 -0
  48. wandb/run-20240831_192346-5vo4p2k7/files/requirements.txt +375 -0
  49. wandb/run-20240831_192346-5vo4p2k7/files/wandb-metadata.json +221 -0
  50. wandb/run-20240831_192346-5vo4p2k7/files/wandb-summary.json +1 -0
wandb/run-20240803_191521-sg37tylz/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-03-19:14:48
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 1600
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 40
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722680121.573481
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 512
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
wandb/run-20240803_191521-sg37tylz/files/output.log ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
5
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
6
+ warnings.warn(
7
+ Let split = None
8
+ Loading model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
9
+ Loaded model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
10
+ --> Model /share/pretrained_lm/custom/tiny-mistral
11
+ --> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
12
+ BFloat16 enabled for mixed precision - using bfSixteen policy
13
+ --> applying fsdp activation checkpointing...
14
+ > datasets target sizes (minimum size):
15
+ train: 32000000
16
+ validation: 1616000
17
+ test: 16000
18
+ > building train, validation, and test datasets for GPT ...
19
+ Building a BlendedDataset for a single MegatronDataset
20
+ Unable to save the indexes because path_to_cache is None
21
+ Building a BlendedDataset for a single MegatronDataset
22
+ Unable to save the indexes because path_to_cache is None
23
+ Building a BlendedDataset for a single MegatronDataset
24
+ Unable to save the indexes because path_to_cache is None
25
+ > finished creating GPT datasets ...
26
+ Loading optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
27
+ Loaded optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): MistralForCausalLM(
30
+ (model): MistralModel(
31
+ (embed_tokens): Embedding(32768, 256)
32
+ (layers): ModuleList(
33
+ (0-3): 4 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): MistralDecoderLayer(
36
+ (self_attn): MistralFlashAttention2(
37
+ (q_proj): Linear(in_features=256, out_features=512, bias=False)
38
+ (k_proj): Linear(in_features=256, out_features=256, bias=False)
39
+ (v_proj): Linear(in_features=256, out_features=256, bias=False)
40
+ (o_proj): Linear(in_features=512, out_features=256, bias=False)
41
+ (rotary_emb): MistralRotaryEmbedding()
42
+ )
43
+ (mlp): MistralMLP(
44
+ (gate_proj): Linear(in_features=256, out_features=512, bias=False)
45
+ (up_proj): Linear(in_features=256, out_features=512, bias=False)
46
+ (down_proj): Linear(in_features=512, out_features=256, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): MistralRMSNorm()
50
+ (post_attention_layernorm): MistralRMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): MistralRMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=256, out_features=32768, bias=False)
58
+ )
59
+ )
60
+ model config: MistralConfig {
61
+ "_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
62
+ "architectures": [
63
+ "MistralForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 1,
67
+ "eos_token_id": 2,
68
+ "head_dim": 128,
69
+ "hidden_act": "silu",
70
+ "hidden_size": 256,
71
+ "initializer_range": 0.02,
72
+ "intermediate_size": 512,
73
+ "label_smoothing": 0.0,
74
+ "max_position_embeddings": 512,
75
+ "model_type": "mistral",
76
+ "num_attention_heads": 4,
77
+ "num_hidden_layers": 4,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-05,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": 4096,
82
+ "tie_word_embeddings": false,
83
+ "torch_dtype": "float32",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "vocab_size": 32768
87
+ }
88
+ Saving checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000
89
+ Saving model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
90
+ Saved model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
91
+ Saving optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
92
+ Saved optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
93
+ Saving scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
94
+ Saved scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
95
+ Saving RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
96
+ Saved RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
97
+ Saved checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000, took 0.18s
98
+ [rank0]:[2024-08-03 19:15:37,067] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
99
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
100
+ warnings.warn(
101
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
102
+ warnings.warn(
103
+ [rank0]:[2024-08-03 19:15:37,199] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0010301990000698424, 'preprocessing_with_comm': 0.009796595000352681, 'state_converting': 0.007119276000139507, <Type.ALL: 'all'>: 0.018263464000028762})
wandb/run-20240803_191521-sg37tylz/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T10:15:22.462858",
5
+ "startedAt": "2024-08-03T10:15:21.560082",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "40",
15
+ "--global-batch-size",
16
+ "1600",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample_train_2024-08-03-19:14:48"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 14}}
wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-03 19:15:21,574 INFO StreamThr :9246 [internal.py:wandb_internal():86] W&B internal server running at pid: 9246, started at: 2024-08-03 19:15:21.573011
2
+ 2024-08-03 19:15:21,575 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-03 19:15:21,578 INFO WriterThread:9246 [datastore.py:open_for_write():87] open: /project/wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb
4
+ 2024-08-03 19:15:21,579 DEBUG SenderThread:9246 [sender.py:send():382] send: header
5
+ 2024-08-03 19:15:21,755 DEBUG SenderThread:9246 [sender.py:send():382] send: run
6
+ 2024-08-03 19:15:22,256 INFO SenderThread:9246 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240803_191521-sg37tylz/files
7
+ 2024-08-03 19:15:22,257 INFO SenderThread:9246 [sender.py:_start_run_threads():1136] run started: sg37tylz with start time 1722680121.573481
8
+ 2024-08-03 19:15:22,262 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-03 19:15:22,262 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-03 19:15:22,442 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-03 19:15:22,448 DEBUG HandlerThread:9246 [system_info.py:__init__():27] System info init
12
+ 2024-08-03 19:15:22,448 DEBUG HandlerThread:9246 [system_info.py:__init__():42] System info init done
13
+ 2024-08-03 19:15:22,448 INFO HandlerThread:9246 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-03 19:15:22,448 INFO SystemMonitor:9246 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-03 19:15:22,449 INFO HandlerThread:9246 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-03 19:15:22,449 INFO SystemMonitor:9246 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-03 19:15:22,450 INFO SystemMonitor:9246 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-03 19:15:22,450 INFO SystemMonitor:9246 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-03 19:15:22,451 INFO SystemMonitor:9246 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-03 19:15:22,452 INFO SystemMonitor:9246 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-03 19:15:22,462 DEBUG HandlerThread:9246 [system_info.py:probe():151] Probing system
22
+ 2024-08-03 19:15:22,464 DEBUG HandlerThread:9246 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_info.py:probe():199] Probing system done
25
+ 2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T10:15:22.462858', 'startedAt': '2024-08-03T10:15:21.560082', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '40', '--global-batch-size', '1600', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-03-19:14:48'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-03 19:15:22,476 INFO HandlerThread:9246 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-03 19:15:22,476 INFO HandlerThread:9246 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-03 19:15:22,478 INFO HandlerThread:9246 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-03 19:15:22,505 DEBUG SenderThread:9246 [sender.py:send():382] send: files
30
+ 2024-08-03 19:15:22,505 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-03 19:15:22,514 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-03 19:15:22,527 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-03 19:15:22,755 DEBUG SenderThread:9246 [sender.py:send():382] send: telemetry
37
+ 2024-08-03 19:15:23,258 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt
38
+ 2024-08-03 19:15:23,258 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json
39
+ 2024-08-03 19:15:25,596 INFO wandb-upload_0:9246 [upload_job.py:push():131] Uploaded file /tmp/tmp1lfwq_epwandb/7v7ji8nj-wandb-metadata.json
40
+ 2024-08-03 19:15:26,756 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
41
+ 2024-08-03 19:15:28,261 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
42
+ 2024-08-03 19:15:30,262 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
43
+ 2024-08-03 19:15:31,262 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
44
+ 2024-08-03 19:15:31,904 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-03 19:15:33,263 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
46
+ 2024-08-03 19:15:37,068 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
47
+ 2024-08-03 19:15:37,110 DEBUG SenderThread:9246 [sender.py:send():382] send: config
48
+ 2024-08-03 19:15:37,111 DEBUG SenderThread:9246 [sender.py:send():382] send: config
49
+ 2024-08-03 19:15:37,265 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
50
+ 2024-08-03 19:15:37,316 DEBUG SenderThread:9246 [sender.py:send():382] send: exit
51
+ 2024-08-03 19:15:37,316 INFO SenderThread:9246 [sender.py:send_exit():589] handling exit code: 0
52
+ 2024-08-03 19:15:37,317 INFO SenderThread:9246 [sender.py:send_exit():591] handling runtime: 14
53
+ 2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
54
+ 2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:send_exit():597] send defer
55
+ 2024-08-03 19:15:37,318 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
56
+ 2024-08-03 19:15:37,318 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 0
57
+ 2024-08-03 19:15:37,318 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
58
+ 2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 0
59
+ 2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 1
60
+ 2024-08-03 19:15:37,319 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
61
+ 2024-08-03 19:15:37,319 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 1
62
+ 2024-08-03 19:15:37,319 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
63
+ 2024-08-03 19:15:37,319 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 1
64
+ 2024-08-03 19:15:37,319 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 2
65
+ 2024-08-03 19:15:37,319 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
66
+ 2024-08-03 19:15:37,319 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 2
67
+ 2024-08-03 19:15:37,319 INFO HandlerThread:9246 [system_monitor.py:finish():203] Stopping system monitor
68
+ 2024-08-03 19:15:37,319 DEBUG SystemMonitor:9246 [system_monitor.py:_start():172] Starting system metrics aggregation loop
69
+ 2024-08-03 19:15:37,319 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined cpu monitor
70
+ 2024-08-03 19:15:37,320 DEBUG SystemMonitor:9246 [system_monitor.py:_start():179] Finished system metrics aggregation loop
71
+ 2024-08-03 19:15:37,320 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined disk monitor
72
+ 2024-08-03 19:15:37,320 DEBUG SystemMonitor:9246 [system_monitor.py:_start():183] Publishing last batch of metrics
73
+ 2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined gpu monitor
74
+ 2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined memory monitor
75
+ 2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined network monitor
76
+ 2024-08-03 19:15:37,352 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
77
+ 2024-08-03 19:15:37,352 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 2
78
+ 2024-08-03 19:15:37,352 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 3
79
+ 2024-08-03 19:15:37,352 DEBUG SenderThread:9246 [sender.py:send():382] send: stats
80
+ 2024-08-03 19:15:37,352 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
81
+ 2024-08-03 19:15:37,352 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 3
82
+ 2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
83
+ 2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 3
84
+ 2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 4
85
+ 2024-08-03 19:15:37,353 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
86
+ 2024-08-03 19:15:37,353 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 4
87
+ 2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 4
89
+ 2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 5
90
+ 2024-08-03 19:15:37,353 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-03 19:15:37,353 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 5
92
+ 2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send():382] send: summary
93
+ 2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
94
+ 2024-08-03 19:15:37,354 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
95
+ 2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 5
96
+ 2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 6
97
+ 2024-08-03 19:15:37,355 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
98
+ 2024-08-03 19:15:37,355 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 6
99
+ 2024-08-03 19:15:37,355 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
100
+ 2024-08-03 19:15:37,355 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 6
101
+ 2024-08-03 19:15:37,357 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-03 19:15:37,549 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 7
103
+ 2024-08-03 19:15:37,550 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
104
+ 2024-08-03 19:15:37,550 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 7
105
+ 2024-08-03 19:15:37,550 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
106
+ 2024-08-03 19:15:37,550 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 7
107
+ 2024-08-03 19:15:38,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml
108
+ 2024-08-03 19:15:38,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json
109
+ 2024-08-03 19:15:38,316 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
110
+ 2024-08-03 19:15:39,265 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 8
111
+ 2024-08-03 19:15:39,265 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
112
+ 2024-08-03 19:15:39,265 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-03 19:15:39,265 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 8
114
+ 2024-08-03 19:15:39,265 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
115
+ 2024-08-03 19:15:39,265 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 8
116
+ 2024-08-03 19:15:39,266 INFO SenderThread:9246 [job_builder.py:build():296] Attempting to build job artifact
117
+ 2024-08-03 19:15:39,266 INFO SenderThread:9246 [job_builder.py:_get_source_type():426] is repo sourced job
118
+ 2024-08-03 19:15:39,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
119
+ 2024-08-03 19:15:39,281 INFO SenderThread:9246 [job_builder.py:build():402] adding wandb-job metadata file
120
+ 2024-08-03 19:15:39,289 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 9
121
+ 2024-08-03 19:15:39,290 DEBUG SenderThread:9246 [sender.py:send():382] send: artifact
122
+ 2024-08-03 19:15:39,290 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
123
+ 2024-08-03 19:15:39,291 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 9
124
+ 2024-08-03 19:15:39,317 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
125
+ 2024-08-03 19:15:40,500 INFO wandb-upload_1:9246 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpftiijukc
126
+ 2024-08-03 19:15:40,873 INFO wandb-upload_0:9246 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp8adahfv9
127
+ 2024-08-03 19:15:42,354 INFO SenderThread:9246 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA4NTUyMzkyNA==', 'versionIndex': 0}}}
128
+ 2024-08-03 19:15:42,354 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
129
+ 2024-08-03 19:15:42,354 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 9
130
+ 2024-08-03 19:15:42,354 INFO SenderThread:9246 [dir_watcher.py:finish():358] shutting down directory watcher
131
+ 2024-08-03 19:15:43,268 INFO SenderThread:9246 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240803_191521-sg37tylz/files
132
+ 2024-08-03 19:15:43,268 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt requirements.txt
133
+ 2024-08-03 19:15:43,269 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml config.yaml
134
+ 2024-08-03 19:15:43,270 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json wandb-metadata.json
135
+ 2024-08-03 19:15:43,270 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json wandb-summary.json
136
+ 2024-08-03 19:15:43,271 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/output.log output.log
137
+ 2024-08-03 19:15:43,273 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 10
138
+ 2024-08-03 19:15:43,273 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
139
+ 2024-08-03 19:15:43,273 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
140
+ 2024-08-03 19:15:43,275 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 10
141
+ 2024-08-03 19:15:43,275 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
142
+ 2024-08-03 19:15:43,275 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 10
143
+ 2024-08-03 19:15:43,275 INFO SenderThread:9246 [file_pusher.py:finish():172] shutting down file pusher
144
+ 2024-08-03 19:15:43,318 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
145
+ 2024-08-03 19:15:43,318 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
146
+ 2024-08-03 19:15:43,685 INFO wandb-upload_1:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt
147
+ 2024-08-03 19:15:43,839 INFO wandb-upload_0:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml
148
+ 2024-08-03 19:15:43,870 INFO wandb-upload_2:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json
149
+ 2024-08-03 19:15:43,873 INFO wandb-upload_3:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/output.log
150
+ 2024-08-03 19:15:44,073 INFO Thread-11 (_thread_body):9246 [sender.py:transition_state():617] send defer: 11
151
+ 2024-08-03 19:15:44,074 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
152
+ 2024-08-03 19:15:44,074 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 11
153
+ 2024-08-03 19:15:44,074 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
154
+ 2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 11
155
+ 2024-08-03 19:15:44,074 INFO SenderThread:9246 [file_pusher.py:join():178] waiting for file pusher
156
+ 2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 12
157
+ 2024-08-03 19:15:44,074 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
158
+ 2024-08-03 19:15:44,074 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 12
159
+ 2024-08-03 19:15:44,074 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
160
+ 2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 12
161
+ 2024-08-03 19:15:44,074 INFO SenderThread:9246 [file_stream.py:finish():595] file stream finish called
162
+ 2024-08-03 19:15:44,248 INFO SenderThread:9246 [file_stream.py:finish():599] file stream finish is done
163
+ 2024-08-03 19:15:44,248 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 13
164
+ 2024-08-03 19:15:44,248 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
165
+ 2024-08-03 19:15:44,248 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 13
166
+ 2024-08-03 19:15:44,248 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
167
+ 2024-08-03 19:15:44,248 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 13
168
+ 2024-08-03 19:15:44,249 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 14
169
+ 2024-08-03 19:15:44,249 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
170
+ 2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send():382] send: final
171
+ 2024-08-03 19:15:44,249 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 14
172
+ 2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send():382] send: footer
173
+ 2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
174
+ 2024-08-03 19:15:44,249 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 14
175
+ 2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
176
+ 2024-08-03 19:15:44,250 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
177
+ 2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
178
+ 2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: server_info
179
+ 2024-08-03 19:15:44,251 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: get_summary
180
+ 2024-08-03 19:15:44,251 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
181
+ 2024-08-03 19:15:44,251 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: server_info
182
+ 2024-08-03 19:15:44,252 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: sampled_history
183
+ 2024-08-03 19:15:44,252 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: internal_messages
184
+ 2024-08-03 19:15:44,253 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: job_info
185
+ 2024-08-03 19:15:44,413 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: job_info
186
+ 2024-08-03 19:15:44,413 INFO MainThread:9246 [wandb_run.py:_footer_history_summary_info():3866] rendering history
187
+ 2024-08-03 19:15:44,413 INFO MainThread:9246 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
188
+ 2024-08-03 19:15:44,414 INFO MainThread:9246 [wandb_run.py:_footer_sync_info():3825] logging synced files
189
+ 2024-08-03 19:15:44,414 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: shutdown
190
+ 2024-08-03 19:15:44,414 INFO HandlerThread:9246 [handler.py:finish():869] shutting down handler
191
+ 2024-08-03 19:15:45,253 INFO WriterThread:9246 [datastore.py:close():296] close: /project/wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb
192
+ 2024-08-03 19:15:45,413 INFO SenderThread:9246 [sender.py:finish():1572] shutting down sender
193
+ 2024-08-03 19:15:45,413 INFO SenderThread:9246 [file_pusher.py:finish():172] shutting down file pusher
194
+ 2024-08-03 19:15:45,414 INFO SenderThread:9246 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240803_191521-sg37tylz/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-03 19:15:21,565 INFO MainThread:9173 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Configure stats pid to 9173
3
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_191521-sg37tylz/logs/debug.log
9
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log
10
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-03-19:14:48', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():616] starting backend
14
+ 2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-03 19:15:21,571 INFO MainThread:9173 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-03 19:15:21,573 INFO MainThread:9173 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-03 19:15:21,578 INFO MainThread:9173 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-03 19:15:21,751 INFO MainThread:9173 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-03 19:15:22,261 INFO MainThread:9173 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-03 19:15:22,435 INFO MainThread:9173 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-03 19:15:22,435 INFO MainThread:9173 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-03 19:15:22,513 INFO MainThread:9173 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-03 19:15:37,109 INFO MainThread:9173 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-03 19:15:37,110 INFO MainThread:9173 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-03 19:15:45,414 WARNING MsgRouterThr:9173 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb ADDED
Binary file (17.1 kB). View file
 
wandb/run-20240803_191815-jdwps0z3/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample2_train_2024-08-03-19:18:05
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample2
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample2
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 1600
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 40
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample2
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722680295.872336
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 512
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
wandb/run-20240803_191815-jdwps0z3/files/output.log ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-mistral-sample2, skipping model loading
11
+ --> Model /share/pretrained_lm/custom/tiny-mistral
12
+ --> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
13
+ BFloat16 enabled for mixed precision - using bfSixteen policy
14
+ --> applying fsdp activation checkpointing...
15
+ > datasets target sizes (minimum size):
16
+ train: 32000000
17
+ validation: 1616000
18
+ test: 16000
19
+ > building train, validation, and test datasets for GPT ...
20
+ > finished creating GPT datasets ...
21
+ File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
22
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
23
+ No checkpoint found in /work/llm_recipes/models/tiny-mistral-sample2, skipping optimizer loading
24
+ File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
25
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
26
+ model info: FullyShardedDataParallel(
27
+ (_fsdp_wrapped_module): MistralForCausalLM(
28
+ (model): MistralModel(
29
+ (embed_tokens): Embedding(32768, 256)
30
+ (layers): ModuleList(
31
+ (0-3): 4 x FullyShardedDataParallel(
32
+ (_fsdp_wrapped_module): CheckpointWrapper(
33
+ (_checkpoint_wrapped_module): MistralDecoderLayer(
34
+ (self_attn): MistralFlashAttention2(
35
+ (q_proj): Linear(in_features=256, out_features=512, bias=False)
36
+ (k_proj): Linear(in_features=256, out_features=256, bias=False)
37
+ (v_proj): Linear(in_features=256, out_features=256, bias=False)
38
+ (o_proj): Linear(in_features=512, out_features=256, bias=False)
39
+ (rotary_emb): MistralRotaryEmbedding()
40
+ )
41
+ (mlp): MistralMLP(
42
+ (gate_proj): Linear(in_features=256, out_features=512, bias=False)
43
+ (up_proj): Linear(in_features=256, out_features=512, bias=False)
44
+ (down_proj): Linear(in_features=512, out_features=256, bias=False)
45
+ (act_fn): SiLU()
46
+ )
47
+ (input_layernorm): MistralRMSNorm()
48
+ (post_attention_layernorm): MistralRMSNorm()
49
+ )
50
+ )
51
+ )
52
+ )
53
+ (norm): MistralRMSNorm()
54
+ )
55
+ (lm_head): Linear(in_features=256, out_features=32768, bias=False)
56
+ )
57
+ )
58
+ model config: MistralConfig {
59
+ "_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
60
+ "architectures": [
61
+ "MistralForCausalLM"
62
+ ],
63
+ "attention_dropout": 0.0,
64
+ "bos_token_id": 1,
65
+ "eos_token_id": 2,
66
+ "head_dim": 128,
67
+ "hidden_act": "silu",
68
+ "hidden_size": 256,
69
+ "initializer_range": 0.02,
70
+ "intermediate_size": 512,
71
+ "label_smoothing": 0.0,
72
+ "max_position_embeddings": 512,
73
+ "model_type": "mistral",
74
+ "num_attention_heads": 4,
75
+ "num_hidden_layers": 4,
76
+ "num_key_value_heads": 2,
77
+ "rms_norm_eps": 1e-05,
78
+ "rope_theta": 1000000.0,
79
+ "sliding_window": 4096,
80
+ "tie_word_embeddings": false,
81
+ "torch_dtype": "float32",
82
+ "transformers_version": "4.43.3",
83
+ "use_cache": false,
84
+ "vocab_size": 32768
85
+ }
86
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
87
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
88
+ warnings.warn(
89
+ Let split = None
90
+ Building a BlendedDataset for a single MegatronDataset
91
+ Unable to save the indexes because path_to_cache is None
92
+ Building a BlendedDataset for a single MegatronDataset
93
+ Unable to save the indexes because path_to_cache is None
94
+ Building a BlendedDataset for a single MegatronDataset
95
+ Unable to save the indexes because path_to_cache is None
96
+ ------------------------------------------------------------------
97
+ iteration: 1 , TFLOPS: 7.474408749953364, Tokens per sec: 105583.91620199519, Loss: 10.45809555053711
98
+ ------------------------------------------------------------------
99
+ ------------------------------------------------------------------
100
+ iteration: 2 , TFLOPS: 19.626322889381036, Tokens per sec: 277242.53524650185, Loss: 10.456090927124023
101
+ ------------------------------------------------------------------
102
+ ------------------------------------------------------------------
103
+ iteration: 3 , TFLOPS: 19.74558354343518, Tokens per sec: 278927.2179184158, Loss: 10.458115577697754
104
+ ------------------------------------------------------------------
105
+ ------------------------------------------------------------------
106
+ iteration: 4 , TFLOPS: 19.588618586475842, Tokens per sec: 276709.92215407215, Loss: 10.456512451171875
107
+ ------------------------------------------------------------------
108
+ ------------------------------------------------------------------
109
+ iteration: 5 , TFLOPS: 19.72619917981179, Tokens per sec: 278653.39331329847, Loss: 10.458467483520508
110
+ ------------------------------------------------------------------
111
+ ------------------------------------------------------------------
112
+ iteration: 6 , TFLOPS: 19.627595003583973, Tokens per sec: 277260.5052029086, Loss: 10.457172393798828
113
+ ------------------------------------------------------------------
114
+ ------------------------------------------------------------------
115
+ iteration: 7 , TFLOPS: 19.77370988376223, Tokens per sec: 279324.53217557067, Loss: 10.457324981689453
116
+ ------------------------------------------------------------------
117
+ ------------------------------------------------------------------
118
+ iteration: 8 , TFLOPS: 19.77766998452317, Tokens per sec: 279380.47278049105, Loss: 10.457306861877441
119
+ ------------------------------------------------------------------
120
+ ------------------------------------------------------------------
121
+ iteration: 9 , TFLOPS: 19.749958254856136, Tokens per sec: 278989.01533671736, Loss: 10.457944869995117
122
+ ------------------------------------------------------------------
123
+ ------------------------------------------------------------------
124
+ iteration: 10 , TFLOPS: 19.752309517490676, Tokens per sec: 279022.22940424824, Loss: 10.45663833618164
125
+ ------------------------------------------------------------------
126
+ ------------------------------------------------------------------
127
+ iteration: 11 , TFLOPS: 19.74392548352723, Tokens per sec: 278903.7960713861, Loss: 10.456352233886719
128
+ ------------------------------------------------------------------
129
+ ------------------------------------------------------------------
130
+ iteration: 12 , TFLOPS: 19.758394357809003, Tokens per sec: 279108.18419903744, Loss: 10.455950736999512
131
+ ------------------------------------------------------------------
132
+ ------------------------------------------------------------------
133
+ iteration: 13 , TFLOPS: 19.729624964309398, Tokens per sec: 278701.78613678756, Loss: 10.45804214477539
134
+ ------------------------------------------------------------------
135
+ ------------------------------------------------------------------
136
+ iteration: 14 , TFLOPS: 19.626730348672773, Tokens per sec: 277248.29103925463, Loss: 10.457955360412598
137
+ ------------------------------------------------------------------
138
+ ------------------------------------------------------------------
139
+ iteration: 15 , TFLOPS: 19.724026466287597, Tokens per sec: 278622.7014404904, Loss: 10.459123611450195
140
+ ------------------------------------------------------------------
141
+ ------------------------------------------------------------------
142
+ iteration: 16 , TFLOPS: 19.108803574684035, Tokens per sec: 269932.0284514028, Loss: 10.45695686340332
143
+ ------------------------------------------------------------------
144
+ ------------------------------------------------------------------
145
+ iteration: 17 , TFLOPS: 19.73958477407385, Tokens per sec: 278842.4789667809, Loss: 10.456724166870117
146
+ ------------------------------------------------------------------
147
+ ------------------------------------------------------------------
148
+ iteration: 18 , TFLOPS: 19.47597897335848, Tokens per sec: 275118.76867688465, Loss: 10.456052780151367
149
+ ------------------------------------------------------------------
150
+ ------------------------------------------------------------------
151
+ iteration: 19 , TFLOPS: 19.72614156699911, Tokens per sec: 278652.57947148307, Loss: 10.458043098449707
152
+ ------------------------------------------------------------------
153
+ ------------------------------------------------------------------
154
+ iteration: 20 , TFLOPS: 19.701494428407866, Tokens per sec: 278304.4126127127, Loss: 10.455198287963867
155
+ ------------------------------------------------------------------
156
+ ------------------------------------------------------------------
157
+ iteration: 21 , TFLOPS: 19.74596953791029, Tokens per sec: 278932.6704979679, Loss: 10.457011222839355
158
+ ------------------------------------------------------------------
159
+ ------------------------------------------------------------------
160
+ iteration: 22 , TFLOPS: 19.615963035173014, Tokens per sec: 277096.1913663158, Loss: 10.457304954528809
161
+ ------------------------------------------------------------------
162
+ ------------------------------------------------------------------
163
+ iteration: 23 , TFLOPS: 19.718852381531324, Tokens per sec: 278549.61202972836, Loss: 10.456293106079102
164
+ ------------------------------------------------------------------
165
+ ------------------------------------------------------------------
166
+ iteration: 24 , TFLOPS: 19.67743842750437, Tokens per sec: 277964.5961979942, Loss: 10.456793785095215
167
+ ------------------------------------------------------------------
168
+ ------------------------------------------------------------------
169
+ iteration: 25 , TFLOPS: 19.72792104268366, Tokens per sec: 278677.7164445664, Loss: 10.45568561553955
170
+ ------------------------------------------------------------------
171
+ ------------------------------------------------------------------
172
+ iteration: 26 , TFLOPS: 19.766049190998213, Tokens per sec: 279216.31680096325, Loss: 10.455270767211914
173
+ ------------------------------------------------------------------
174
+ ------------------------------------------------------------------
175
+ iteration: 27 , TFLOPS: 19.748005203155174, Tokens per sec: 278961.4264191145, Loss: 10.456525802612305
176
+ ------------------------------------------------------------------
177
+ ------------------------------------------------------------------
178
+ iteration: 28 , TFLOPS: 19.788746629218007, Tokens per sec: 279536.9420831989, Loss: 10.458827018737793
179
+ ------------------------------------------------------------------
180
+ ------------------------------------------------------------------
181
+ iteration: 29 , TFLOPS: 19.64595613343327, Tokens per sec: 277519.8755504821, Loss: 10.454755783081055
182
+ ------------------------------------------------------------------
183
+ ------------------------------------------------------------------
184
+ iteration: 30 , TFLOPS: 19.791751212574006, Tokens per sec: 279579.38499579503, Loss: 10.455424308776855
185
+ ------------------------------------------------------------------
186
+ ------------------------------------------------------------------
187
+ iteration: 31 , TFLOPS: 19.647830180890995, Tokens per sec: 277546.34844972467, Loss: 10.455726623535156
188
+ ------------------------------------------------------------------
189
+ ------------------------------------------------------------------
190
+ iteration: 32 , TFLOPS: 19.108735127334096, Tokens per sec: 269931.06156030786, Loss: 10.456134796142578
191
+ ------------------------------------------------------------------
192
+ ------------------------------------------------------------------
193
+ iteration: 33 , TFLOPS: 19.790956814139804, Tokens per sec: 279568.16327906615, Loss: 10.45483684539795
194
+ ------------------------------------------------------------------
195
+ ------------------------------------------------------------------
196
+ iteration: 34 , TFLOPS: 19.809776481063633, Tokens per sec: 279834.0109470101, Loss: 10.455580711364746
197
+ ------------------------------------------------------------------
198
+ ------------------------------------------------------------------
199
+ iteration: 35 , TFLOPS: 19.78921404344017, Tokens per sec: 279543.5448026535, Loss: 10.4553861618042
200
+ ------------------------------------------------------------------
201
+ ------------------------------------------------------------------
202
+ iteration: 36 , TFLOPS: 19.760295167208643, Tokens per sec: 279135.03513896884, Loss: 10.455459594726562
203
+ ------------------------------------------------------------------
204
+ ------------------------------------------------------------------
205
+ iteration: 37 , TFLOPS: 19.77547756611738, Tokens per sec: 279349.502555423, Loss: 10.45541000366211
206
+ ------------------------------------------------------------------
207
+ ------------------------------------------------------------------
208
+ iteration: 38 , TFLOPS: 19.777886920468827, Tokens per sec: 279383.53722979716, Loss: 10.455677032470703
209
+ ------------------------------------------------------------------
210
+ ------------------------------------------------------------------
211
+ iteration: 39 , TFLOPS: 19.686203609180314, Tokens per sec: 278088.4136447687, Loss: 10.454801559448242
212
+ ------------------------------------------------------------------
213
+ ------------------------------------------------------------------
214
+ iteration: 40 , TFLOPS: 19.733730730492624, Tokens per sec: 278759.7844196133, Loss: 10.45442008972168
215
+ ------------------------------------------------------------------
216
+ ------------------------------------------------------------------
217
+ iteration: 41 , TFLOPS: 19.781936717382273, Tokens per sec: 279440.74488758645, Loss: 10.453974723815918
218
+ ------------------------------------------------------------------
219
+ ------------------------------------------------------------------
220
+ iteration: 42 , TFLOPS: 19.713193575703478, Tokens per sec: 278469.67542198877, Loss: 10.454327583312988
221
+ ------------------------------------------------------------------
222
+ ------------------------------------------------------------------
223
+ iteration: 43 , TFLOPS: 19.765683899965286, Tokens per sec: 279211.1566793938, Loss: 10.453243255615234
224
+ ------------------------------------------------------------------
225
+ ------------------------------------------------------------------
226
+ iteration: 44 , TFLOPS: 19.737930705062368, Tokens per sec: 278819.11349537794, Loss: 10.452882766723633
227
+ ------------------------------------------------------------------
228
+ ------------------------------------------------------------------
229
+ iteration: 45 , TFLOPS: 19.768669599163157, Tokens per sec: 279253.3328333156, Loss: 10.452847480773926
230
+ ------------------------------------------------------------------
231
+ ------------------------------------------------------------------
232
+ iteration: 46 , TFLOPS: 19.195477136883916, Tokens per sec: 271156.3840405051, Loss: 10.452914237976074
233
+ ------------------------------------------------------------------
234
+ ------------------------------------------------------------------
235
+ iteration: 47 , TFLOPS: 19.78773782868223, Tokens per sec: 279522.69171038724, Loss: 10.452442169189453
236
+ ------------------------------------------------------------------
237
+ ------------------------------------------------------------------
238
+ iteration: 48 , TFLOPS: 19.79785807451266, Tokens per sec: 279665.6508692251, Loss: 10.452777862548828
239
+ ------------------------------------------------------------------
wandb/run-20240803_191815-jdwps0z3/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T10:18:16.495932",
5
+ "startedAt": "2024-08-03T10:18:15.860061",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "40",
15
+ "--global-batch-size",
16
+ "1600",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample2",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample2",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample2",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample2_train_2024-08-03-19:18:05"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 10.452777862548828, "training/perplexity": 34640.467638738424, "utils/batch_size": 40, "utils/global_batch_size": 1600, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.0037958921404718368, "optimizer/variance_sqrt_l2": 0.9574357404136762, "optimizer/momentum_l2": 0.9945472829864366, "optimizer/weight_l2": 101.38293742045552, "optimizer/variance_l1": 0.91748046875, "optimizer/variance_sqrt_l1": 723.0, "optimizer/momentum_l1": 740.5, "optimizer/weight_l1": 320512.0, "optimizer/variance_abs_max": 0.0003185272216796875, "optimizer/variance_sqrt_abs_max": 0.017822265625, "optimizer/momentum_abs_max": 0.0184326171875, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 2.9349331869998423, "stats/tokens_per_sec": 279665.6508692251, "stats/tokens_per_sec_per_gpu": 279665.6508692251, "stats/tflops": 19.79785807451266, "_timestamp": 1722680446.903461, "_runtime": 151.03112506866455, "_step": 48, "_wandb": {"runtime": 151}}
wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-03 19:18:15,873 INFO StreamThr :9504 [internal.py:wandb_internal():86] W&B internal server running at pid: 9504, started at: 2024-08-03 19:18:15.872881
2
+ 2024-08-03 19:18:15,875 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-03 19:18:15,877 INFO WriterThread:9504 [datastore.py:open_for_write():87] open: /project/wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb
4
+ 2024-08-03 19:18:15,878 DEBUG SenderThread:9504 [sender.py:send():382] send: header
5
+ 2024-08-03 19:18:15,892 DEBUG SenderThread:9504 [sender.py:send():382] send: run
6
+ 2024-08-03 19:18:16,382 INFO SenderThread:9504 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240803_191815-jdwps0z3/files
7
+ 2024-08-03 19:18:16,382 INFO SenderThread:9504 [sender.py:_start_run_threads():1136] run started: jdwps0z3 with start time 1722680295.872336
8
+ 2024-08-03 19:18:16,387 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-03 19:18:16,387 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-03 19:18:16,477 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-03 19:18:16,483 DEBUG HandlerThread:9504 [system_info.py:__init__():27] System info init
12
+ 2024-08-03 19:18:16,483 DEBUG HandlerThread:9504 [system_info.py:__init__():42] System info init done
13
+ 2024-08-03 19:18:16,483 INFO HandlerThread:9504 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-03 19:18:16,483 INFO SystemMonitor:9504 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-03 19:18:16,483 INFO HandlerThread:9504 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-03 19:18:16,484 INFO SystemMonitor:9504 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-03 19:18:16,485 INFO SystemMonitor:9504 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-03 19:18:16,486 INFO SystemMonitor:9504 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-03 19:18:16,486 INFO SystemMonitor:9504 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-03 19:18:16,487 INFO SystemMonitor:9504 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-03 19:18:16,495 DEBUG HandlerThread:9504 [system_info.py:probe():151] Probing system
22
+ 2024-08-03 19:18:16,497 DEBUG HandlerThread:9504 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_info.py:probe():199] Probing system done
25
+ 2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T10:18:16.495932', 'startedAt': '2024-08-03T10:18:15.860061', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '40', '--global-batch-size', '1600', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample2', '--load', '/work/llm_recipes/models/tiny-mistral-sample2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample2_train_2024-08-03-19:18:05'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-03 19:18:16,509 INFO HandlerThread:9504 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-03 19:18:16,509 INFO HandlerThread:9504 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-03 19:18:16,510 INFO HandlerThread:9504 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-03 19:18:16,516 DEBUG SenderThread:9504 [sender.py:send():382] send: files
30
+ 2024-08-03 19:18:16,516 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-03 19:18:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-03 19:18:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-03 19:18:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-03 19:18:16,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-03 19:18:16,528 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-03 19:18:16,829 DEBUG SenderThread:9504 [sender.py:send():382] send: telemetry
37
+ 2024-08-03 19:18:17,199 INFO wandb-upload_0:9504 [upload_job.py:push():131] Uploaded file /tmp/tmp76a9qs5lwandb/q74vxtjq-wandb-metadata.json
38
+ 2024-08-03 19:18:17,385 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json
39
+ 2024-08-03 19:18:17,386 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt
40
+ 2024-08-03 19:18:17,386 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
41
+ 2024-08-03 19:18:17,990 DEBUG SenderThread:9504 [sender.py:send():382] send: config
42
+ 2024-08-03 19:18:17,991 DEBUG SenderThread:9504 [sender.py:send():382] send: config
43
+ 2024-08-03 19:18:19,388 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
44
+ 2024-08-03 19:18:20,991 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-03 19:18:25,992 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-03 19:18:28,096 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
47
+ 2024-08-03 19:18:30,394 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
48
+ 2024-08-03 19:18:31,059 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
49
+ 2024-08-03 19:18:31,062 DEBUG SenderThread:9504 [sender.py:send():382] send: history
50
+ 2024-08-03 19:18:31,063 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
51
+ 2024-08-03 19:18:31,063 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
52
+ 2024-08-03 19:18:31,064 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
53
+ 2024-08-03 19:18:31,395 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
54
+ 2024-08-03 19:18:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
55
+ 2024-08-03 19:18:31,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
56
+ 2024-08-03 19:18:31,568 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
57
+ 2024-08-03 19:18:34,003 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
58
+ 2024-08-03 19:18:34,005 DEBUG SenderThread:9504 [sender.py:send():382] send: history
59
+ 2024-08-03 19:18:34,005 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
60
+ 2024-08-03 19:18:34,006 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
61
+ 2024-08-03 19:18:34,397 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
62
+ 2024-08-03 19:18:34,397 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
63
+ 2024-08-03 19:18:36,398 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
64
+ 2024-08-03 19:18:36,971 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
65
+ 2024-08-03 19:18:36,974 DEBUG SenderThread:9504 [sender.py:send():382] send: history
66
+ 2024-08-03 19:18:36,975 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
67
+ 2024-08-03 19:18:36,976 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-03 19:18:36,977 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
69
+ 2024-08-03 19:18:37,399 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
70
+ 2024-08-03 19:18:39,919 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
71
+ 2024-08-03 19:18:39,921 DEBUG SenderThread:9504 [sender.py:send():382] send: history
72
+ 2024-08-03 19:18:39,921 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
73
+ 2024-08-03 19:18:39,922 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
74
+ 2024-08-03 19:18:40,401 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
75
+ 2024-08-03 19:18:40,401 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
76
+ 2024-08-03 19:18:42,402 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
77
+ 2024-08-03 19:18:42,881 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
78
+ 2024-08-03 19:18:42,883 DEBUG SenderThread:9504 [sender.py:send():382] send: history
79
+ 2024-08-03 19:18:42,883 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
80
+ 2024-08-03 19:18:42,884 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-03 19:18:42,885 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
82
+ 2024-08-03 19:18:43,403 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
83
+ 2024-08-03 19:18:44,403 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
84
+ 2024-08-03 19:18:45,821 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
85
+ 2024-08-03 19:18:45,823 DEBUG SenderThread:9504 [sender.py:send():382] send: history
86
+ 2024-08-03 19:18:45,824 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
87
+ 2024-08-03 19:18:45,825 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
88
+ 2024-08-03 19:18:46,405 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
89
+ 2024-08-03 19:18:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
90
+ 2024-08-03 19:18:46,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
91
+ 2024-08-03 19:18:46,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
92
+ 2024-08-03 19:18:48,406 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
93
+ 2024-08-03 19:18:48,697 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-03 19:18:48,761 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
95
+ 2024-08-03 19:18:48,885 DEBUG SenderThread:9504 [sender.py:send():382] send: history
96
+ 2024-08-03 19:18:48,885 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
97
+ 2024-08-03 19:18:48,887 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
98
+ 2024-08-03 19:18:49,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
99
+ 2024-08-03 19:18:49,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml
100
+ 2024-08-03 19:18:50,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
101
+ 2024-08-03 19:18:51,705 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
102
+ 2024-08-03 19:18:51,707 DEBUG SenderThread:9504 [sender.py:send():382] send: history
103
+ 2024-08-03 19:18:51,708 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
104
+ 2024-08-03 19:18:51,709 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
105
+ 2024-08-03 19:18:52,409 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
106
+ 2024-08-03 19:18:54,410 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
107
+ 2024-08-03 19:18:54,648 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
108
+ 2024-08-03 19:18:54,651 DEBUG SenderThread:9504 [sender.py:send():382] send: history
109
+ 2024-08-03 19:18:54,651 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
110
+ 2024-08-03 19:18:54,651 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
111
+ 2024-08-03 19:18:54,652 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
112
+ 2024-08-03 19:18:55,411 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
113
+ 2024-08-03 19:18:56,412 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
114
+ 2024-08-03 19:18:57,593 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
115
+ 2024-08-03 19:18:57,596 DEBUG SenderThread:9504 [sender.py:send():382] send: history
116
+ 2024-08-03 19:18:57,596 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
117
+ 2024-08-03 19:18:57,597 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
118
+ 2024-08-03 19:18:58,413 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
119
+ 2024-08-03 19:19:00,414 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
120
+ 2024-08-03 19:19:00,536 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
121
+ 2024-08-03 19:19:00,540 DEBUG SenderThread:9504 [sender.py:send():382] send: history
122
+ 2024-08-03 19:19:00,540 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
123
+ 2024-08-03 19:19:00,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
124
+ 2024-08-03 19:19:00,542 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
125
+ 2024-08-03 19:19:01,415 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
126
+ 2024-08-03 19:19:01,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
127
+ 2024-08-03 19:19:01,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
128
+ 2024-08-03 19:19:01,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
129
+ 2024-08-03 19:19:02,416 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
130
+ 2024-08-03 19:19:03,483 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
131
+ 2024-08-03 19:19:03,484 DEBUG SenderThread:9504 [sender.py:send():382] send: history
132
+ 2024-08-03 19:19:03,485 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
133
+ 2024-08-03 19:19:03,486 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
134
+ 2024-08-03 19:19:04,417 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
135
+ 2024-08-03 19:19:06,418 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
136
+ 2024-08-03 19:19:06,446 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
137
+ 2024-08-03 19:19:06,448 DEBUG SenderThread:9504 [sender.py:send():382] send: history
138
+ 2024-08-03 19:19:06,448 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
139
+ 2024-08-03 19:19:06,448 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-08-03 19:19:06,449 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
141
+ 2024-08-03 19:19:07,420 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
142
+ 2024-08-03 19:19:08,420 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
143
+ 2024-08-03 19:19:09,393 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
144
+ 2024-08-03 19:19:09,396 DEBUG SenderThread:9504 [sender.py:send():382] send: history
145
+ 2024-08-03 19:19:09,396 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
146
+ 2024-08-03 19:19:09,397 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
147
+ 2024-08-03 19:19:09,421 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
148
+ 2024-08-03 19:19:10,422 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
149
+ 2024-08-03 19:19:12,398 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
150
+ 2024-08-03 19:19:12,436 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
151
+ 2024-08-03 19:19:12,438 DEBUG SenderThread:9504 [sender.py:send():382] send: history
152
+ 2024-08-03 19:19:12,438 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
153
+ 2024-08-03 19:19:12,440 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
154
+ 2024-08-03 19:19:13,424 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
155
+ 2024-08-03 19:19:14,424 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
156
+ 2024-08-03 19:19:15,381 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
157
+ 2024-08-03 19:19:15,383 DEBUG SenderThread:9504 [sender.py:send():382] send: history
158
+ 2024-08-03 19:19:15,384 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
159
+ 2024-08-03 19:19:15,385 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
160
+ 2024-08-03 19:19:15,425 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
161
+ 2024-08-03 19:19:16,426 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
162
+ 2024-08-03 19:19:16,487 DEBUG SystemMonitor:9504 [system_monitor.py:_start():172] Starting system metrics aggregation loop
163
+ 2024-08-03 19:19:16,489 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
164
+ 2024-08-03 19:19:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
165
+ 2024-08-03 19:19:16,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
166
+ 2024-08-03 19:19:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
167
+ 2024-08-03 19:19:17,757 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
168
+ 2024-08-03 19:19:18,366 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
169
+ 2024-08-03 19:19:18,368 DEBUG SenderThread:9504 [sender.py:send():382] send: history
170
+ 2024-08-03 19:19:18,368 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
171
+ 2024-08-03 19:19:18,369 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
172
+ 2024-08-03 19:19:18,427 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
173
+ 2024-08-03 19:19:20,428 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
174
+ 2024-08-03 19:19:21,314 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
175
+ 2024-08-03 19:19:21,316 DEBUG SenderThread:9504 [sender.py:send():382] send: history
176
+ 2024-08-03 19:19:21,317 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
177
+ 2024-08-03 19:19:21,318 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
178
+ 2024-08-03 19:19:21,429 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
179
+ 2024-08-03 19:19:22,430 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
180
+ 2024-08-03 19:19:23,318 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
181
+ 2024-08-03 19:19:24,265 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
182
+ 2024-08-03 19:19:24,267 DEBUG SenderThread:9504 [sender.py:send():382] send: history
183
+ 2024-08-03 19:19:24,268 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
184
+ 2024-08-03 19:19:24,269 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
185
+ 2024-08-03 19:19:24,431 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
186
+ 2024-08-03 19:19:26,432 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
187
+ 2024-08-03 19:19:27,210 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
188
+ 2024-08-03 19:19:27,212 DEBUG SenderThread:9504 [sender.py:send():382] send: history
189
+ 2024-08-03 19:19:27,213 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
190
+ 2024-08-03 19:19:27,214 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
191
+ 2024-08-03 19:19:27,433 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
192
+ 2024-08-03 19:19:28,434 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
193
+ 2024-08-03 19:19:29,214 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
194
+ 2024-08-03 19:19:30,174 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
195
+ 2024-08-03 19:19:30,175 DEBUG SenderThread:9504 [sender.py:send():382] send: history
196
+ 2024-08-03 19:19:30,176 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
197
+ 2024-08-03 19:19:30,177 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
198
+ 2024-08-03 19:19:30,435 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
199
+ 2024-08-03 19:19:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
200
+ 2024-08-03 19:19:31,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
201
+ 2024-08-03 19:19:31,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
202
+ 2024-08-03 19:19:32,436 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
203
+ 2024-08-03 19:19:33,125 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
204
+ 2024-08-03 19:19:33,126 DEBUG SenderThread:9504 [sender.py:send():382] send: history
205
+ 2024-08-03 19:19:33,126 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
206
+ 2024-08-03 19:19:33,127 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
207
+ 2024-08-03 19:19:33,437 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
208
+ 2024-08-03 19:19:34,438 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
209
+ 2024-08-03 19:19:35,169 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-03 19:19:36,079 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
211
+ 2024-08-03 19:19:36,081 DEBUG SenderThread:9504 [sender.py:send():382] send: history
212
+ 2024-08-03 19:19:36,082 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
213
+ 2024-08-03 19:19:36,084 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
214
+ 2024-08-03 19:19:36,439 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
215
+ 2024-08-03 19:19:38,441 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
216
+ 2024-08-03 19:19:39,026 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
217
+ 2024-08-03 19:19:39,029 DEBUG SenderThread:9504 [sender.py:send():382] send: history
218
+ 2024-08-03 19:19:39,029 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
219
+ 2024-08-03 19:19:39,030 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
220
+ 2024-08-03 19:19:39,442 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
221
+ 2024-08-03 19:19:40,443 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
222
+ 2024-08-03 19:19:41,031 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-03 19:19:41,968 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
224
+ 2024-08-03 19:19:41,970 DEBUG SenderThread:9504 [sender.py:send():382] send: history
225
+ 2024-08-03 19:19:41,970 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
226
+ 2024-08-03 19:19:41,972 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
227
+ 2024-08-03 19:19:42,444 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
228
+ 2024-08-03 19:19:44,445 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
229
+ 2024-08-03 19:19:44,912 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
230
+ 2024-08-03 19:19:44,914 DEBUG SenderThread:9504 [sender.py:send():382] send: history
231
+ 2024-08-03 19:19:44,915 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
232
+ 2024-08-03 19:19:44,916 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
233
+ 2024-08-03 19:19:45,447 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
234
+ 2024-08-03 19:19:46,447 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
235
+ 2024-08-03 19:19:46,489 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
236
+ 2024-08-03 19:19:46,490 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
237
+ 2024-08-03 19:19:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
238
+ 2024-08-03 19:19:46,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
239
+ 2024-08-03 19:19:46,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
240
+ 2024-08-03 19:19:47,851 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
241
+ 2024-08-03 19:19:47,852 DEBUG SenderThread:9504 [sender.py:send():382] send: history
242
+ 2024-08-03 19:19:47,852 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
243
+ 2024-08-03 19:19:47,853 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
244
+ 2024-08-03 19:19:48,449 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
245
+ 2024-08-03 19:19:50,450 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
246
+ 2024-08-03 19:19:50,810 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
247
+ 2024-08-03 19:19:50,814 DEBUG SenderThread:9504 [sender.py:send():382] send: history
248
+ 2024-08-03 19:19:50,814 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
249
+ 2024-08-03 19:19:50,815 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
250
+ 2024-08-03 19:19:51,451 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
251
+ 2024-08-03 19:19:51,815 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
252
+ 2024-08-03 19:19:52,452 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
253
+ 2024-08-03 19:19:53,748 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
254
+ 2024-08-03 19:19:53,750 DEBUG SenderThread:9504 [sender.py:send():382] send: history
255
+ 2024-08-03 19:19:53,750 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
256
+ 2024-08-03 19:19:53,752 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
257
+ 2024-08-03 19:19:54,454 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
258
+ 2024-08-03 19:19:56,455 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
259
+ 2024-08-03 19:19:56,707 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
260
+ 2024-08-03 19:19:56,709 DEBUG SenderThread:9504 [sender.py:send():382] send: history
261
+ 2024-08-03 19:19:56,709 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
262
+ 2024-08-03 19:19:56,711 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
263
+ 2024-08-03 19:19:57,456 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
264
+ 2024-08-03 19:19:57,712 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
265
+ 2024-08-03 19:19:58,456 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
266
+ 2024-08-03 19:19:59,749 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
267
+ 2024-08-03 19:19:59,752 DEBUG SenderThread:9504 [sender.py:send():382] send: history
268
+ 2024-08-03 19:19:59,752 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
269
+ 2024-08-03 19:19:59,754 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
270
+ 2024-08-03 19:20:00,458 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
271
+ 2024-08-03 19:20:01,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
272
+ 2024-08-03 19:20:01,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
273
+ 2024-08-03 19:20:01,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
274
+ 2024-08-03 19:20:02,459 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
275
+ 2024-08-03 19:20:02,687 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
276
+ 2024-08-03 19:20:02,689 DEBUG SenderThread:9504 [sender.py:send():382] send: history
277
+ 2024-08-03 19:20:02,689 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
278
+ 2024-08-03 19:20:02,690 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
279
+ 2024-08-03 19:20:02,729 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
280
+ 2024-08-03 19:20:03,460 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
281
+ 2024-08-03 19:20:04,460 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
282
+ 2024-08-03 19:20:05,622 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
283
+ 2024-08-03 19:20:05,624 DEBUG SenderThread:9504 [sender.py:send():382] send: history
284
+ 2024-08-03 19:20:05,625 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
285
+ 2024-08-03 19:20:05,626 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
286
+ 2024-08-03 19:20:06,462 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
287
+ 2024-08-03 19:20:08,463 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
288
+ 2024-08-03 19:20:08,560 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
289
+ 2024-08-03 19:20:08,563 DEBUG SenderThread:9504 [sender.py:send():382] send: history
290
+ 2024-08-03 19:20:08,563 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
291
+ 2024-08-03 19:20:08,563 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
292
+ 2024-08-03 19:20:08,564 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
293
+ 2024-08-03 19:20:09,464 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
294
+ 2024-08-03 19:20:10,464 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
295
+ 2024-08-03 19:20:11,503 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
296
+ 2024-08-03 19:20:11,505 DEBUG SenderThread:9504 [sender.py:send():382] send: history
297
+ 2024-08-03 19:20:11,506 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
298
+ 2024-08-03 19:20:11,507 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
299
+ 2024-08-03 19:20:12,466 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
300
+ 2024-08-03 19:20:12,466 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
301
+ 2024-08-03 19:20:14,443 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
302
+ 2024-08-03 19:20:14,468 DEBUG SenderThread:9504 [sender.py:send():382] send: history
303
+ 2024-08-03 19:20:14,469 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
304
+ 2024-08-03 19:20:14,469 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
305
+ 2024-08-03 19:20:14,470 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
306
+ 2024-08-03 19:20:15,470 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
307
+ 2024-08-03 19:20:16,471 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
308
+ 2024-08-03 19:20:16,490 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
309
+ 2024-08-03 19:20:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
310
+ 2024-08-03 19:20:16,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
311
+ 2024-08-03 19:20:16,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
312
+ 2024-08-03 19:20:17,382 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
313
+ 2024-08-03 19:20:17,384 DEBUG SenderThread:9504 [sender.py:send():382] send: history
314
+ 2024-08-03 19:20:17,384 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
315
+ 2024-08-03 19:20:17,385 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
316
+ 2024-08-03 19:20:17,471 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
317
+ 2024-08-03 19:20:18,472 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
318
+ 2024-08-03 19:20:20,336 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
319
+ 2024-08-03 19:20:20,338 DEBUG SenderThread:9504 [sender.py:send():382] send: history
320
+ 2024-08-03 19:20:20,339 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
321
+ 2024-08-03 19:20:20,339 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
322
+ 2024-08-03 19:20:20,340 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
323
+ 2024-08-03 19:20:20,473 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
324
+ 2024-08-03 19:20:22,474 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
325
+ 2024-08-03 19:20:23,282 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
326
+ 2024-08-03 19:20:23,284 DEBUG SenderThread:9504 [sender.py:send():382] send: history
327
+ 2024-08-03 19:20:23,285 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
328
+ 2024-08-03 19:20:23,286 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
329
+ 2024-08-03 19:20:23,475 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
330
+ 2024-08-03 19:20:24,476 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
331
+ 2024-08-03 19:20:26,221 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
332
+ 2024-08-03 19:20:26,223 DEBUG SenderThread:9504 [sender.py:send():382] send: history
333
+ 2024-08-03 19:20:26,224 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
334
+ 2024-08-03 19:20:26,224 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
335
+ 2024-08-03 19:20:26,225 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
336
+ 2024-08-03 19:20:26,477 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
337
+ 2024-08-03 19:20:28,478 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
338
+ 2024-08-03 19:20:29,171 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
339
+ 2024-08-03 19:20:29,173 DEBUG SenderThread:9504 [sender.py:send():382] send: history
340
+ 2024-08-03 19:20:29,174 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
341
+ 2024-08-03 19:20:29,175 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
342
+ 2024-08-03 19:20:29,479 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
343
+ 2024-08-03 19:20:30,480 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
344
+ 2024-08-03 19:20:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
345
+ 2024-08-03 19:20:31,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
346
+ 2024-08-03 19:20:31,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
347
+ 2024-08-03 19:20:31,709 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
348
+ 2024-08-03 19:20:32,112 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
349
+ 2024-08-03 19:20:32,114 DEBUG SenderThread:9504 [sender.py:send():382] send: history
350
+ 2024-08-03 19:20:32,114 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
351
+ 2024-08-03 19:20:32,115 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
352
+ 2024-08-03 19:20:32,482 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
353
+ 2024-08-03 19:20:34,483 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
354
+ 2024-08-03 19:20:35,058 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
355
+ 2024-08-03 19:20:35,060 DEBUG SenderThread:9504 [sender.py:send():382] send: history
356
+ 2024-08-03 19:20:35,061 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
357
+ 2024-08-03 19:20:35,062 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
358
+ 2024-08-03 19:20:35,484 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
359
+ 2024-08-03 19:20:36,485 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
360
+ 2024-08-03 19:20:37,062 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
361
+ 2024-08-03 19:20:37,999 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
362
+ 2024-08-03 19:20:38,001 DEBUG SenderThread:9504 [sender.py:send():382] send: history
363
+ 2024-08-03 19:20:38,002 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
364
+ 2024-08-03 19:20:38,003 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
365
+ 2024-08-03 19:20:38,486 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
366
+ 2024-08-03 19:20:40,487 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
367
+ 2024-08-03 19:20:41,029 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
368
+ 2024-08-03 19:20:41,031 DEBUG SenderThread:9504 [sender.py:send():382] send: history
369
+ 2024-08-03 19:20:41,032 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
370
+ 2024-08-03 19:20:41,033 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
371
+ 2024-08-03 19:20:41,488 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
372
+ 2024-08-03 19:20:42,489 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
373
+ 2024-08-03 19:20:43,034 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
374
+ 2024-08-03 19:20:43,967 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
375
+ 2024-08-03 19:20:43,971 DEBUG SenderThread:9504 [sender.py:send():382] send: history
376
+ 2024-08-03 19:20:43,971 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
377
+ 2024-08-03 19:20:43,972 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
378
+ 2024-08-03 19:20:44,490 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
379
+ 2024-08-03 19:20:46,491 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
380
+ 2024-08-03 19:20:46,492 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
381
+ 2024-08-03 19:20:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
382
+ 2024-08-03 19:20:46,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
383
+ 2024-08-03 19:20:46,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
384
+ 2024-08-03 19:20:46,904 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
385
+ 2024-08-03 19:20:46,905 DEBUG SenderThread:9504 [sender.py:send():382] send: history
386
+ 2024-08-03 19:20:46,906 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
387
+ 2024-08-03 19:20:46,907 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
388
+ 2024-08-03 19:20:47,493 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
389
+ 2024-08-03 19:20:47,496 DEBUG SenderThread:9504 [sender.py:send():382] send: exit
390
+ 2024-08-03 19:20:47,496 INFO SenderThread:9504 [sender.py:send_exit():589] handling exit code: 255
391
+ 2024-08-03 19:20:47,496 INFO SenderThread:9504 [sender.py:send_exit():591] handling runtime: 151
392
+ 2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
393
+ 2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:send_exit():597] send defer
394
+ 2024-08-03 19:20:47,497 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
395
+ 2024-08-03 19:20:47,497 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 0
396
+ 2024-08-03 19:20:47,497 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
397
+ 2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 0
398
+ 2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 1
399
+ 2024-08-03 19:20:47,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
400
+ 2024-08-03 19:20:47,498 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 1
401
+ 2024-08-03 19:20:47,498 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
402
+ 2024-08-03 19:20:47,498 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 1
403
+ 2024-08-03 19:20:47,498 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 2
404
+ 2024-08-03 19:20:47,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
405
+ 2024-08-03 19:20:47,498 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 2
406
+ 2024-08-03 19:20:47,498 INFO HandlerThread:9504 [system_monitor.py:finish():203] Stopping system monitor
407
+ 2024-08-03 19:20:47,498 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined cpu monitor
408
+ 2024-08-03 19:20:47,498 DEBUG SystemMonitor:9504 [system_monitor.py:_start():179] Finished system metrics aggregation loop
409
+ 2024-08-03 19:20:47,499 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined disk monitor
410
+ 2024-08-03 19:20:47,499 DEBUG SystemMonitor:9504 [system_monitor.py:_start():183] Publishing last batch of metrics
411
+ 2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined gpu monitor
412
+ 2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined memory monitor
413
+ 2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined network monitor
414
+ 2024-08-03 19:20:47,533 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
415
+ 2024-08-03 19:20:47,533 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 2
416
+ 2024-08-03 19:20:47,533 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 3
417
+ 2024-08-03 19:20:47,533 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
418
+ 2024-08-03 19:20:47,533 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
419
+ 2024-08-03 19:20:47,533 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 3
420
+ 2024-08-03 19:20:47,536 DEBUG SenderThread:9504 [sender.py:send():382] send: history
421
+ 2024-08-03 19:20:47,537 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
422
+ 2024-08-03 19:20:47,537 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
423
+ 2024-08-03 19:20:47,538 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
424
+ 2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 3
425
+ 2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 4
426
+ 2024-08-03 19:20:47,538 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
427
+ 2024-08-03 19:20:47,538 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 4
428
+ 2024-08-03 19:20:47,538 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
429
+ 2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 4
430
+ 2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 5
431
+ 2024-08-03 19:20:47,538 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
432
+ 2024-08-03 19:20:47,538 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 5
433
+ 2024-08-03 19:20:47,539 DEBUG SenderThread:9504 [sender.py:send():382] send: summary
434
+ 2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
435
+ 2024-08-03 19:20:47,540 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
436
+ 2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 5
437
+ 2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 6
438
+ 2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
439
+ 2024-08-03 19:20:47,541 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 6
440
+ 2024-08-03 19:20:47,541 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
441
+ 2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 6
442
+ 2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 7
443
+ 2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
444
+ 2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
445
+ 2024-08-03 19:20:47,541 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 7
446
+ 2024-08-03 19:20:47,541 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
447
+ 2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 7
448
+ 2024-08-03 19:20:47,875 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 8
449
+ 2024-08-03 19:20:47,875 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
450
+ 2024-08-03 19:20:47,875 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 8
451
+ 2024-08-03 19:20:47,875 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
452
+ 2024-08-03 19:20:47,875 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 8
453
+ 2024-08-03 19:20:47,875 INFO SenderThread:9504 [job_builder.py:build():296] Attempting to build job artifact
454
+ 2024-08-03 19:20:47,876 INFO SenderThread:9504 [job_builder.py:_get_source_type():426] is repo sourced job
455
+ 2024-08-03 19:20:47,890 INFO SenderThread:9504 [job_builder.py:build():402] adding wandb-job metadata file
456
+ 2024-08-03 19:20:47,898 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 9
457
+ 2024-08-03 19:20:47,899 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
458
+ 2024-08-03 19:20:47,899 DEBUG SenderThread:9504 [sender.py:send():382] send: artifact
459
+ 2024-08-03 19:20:47,899 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 9
460
+ 2024-08-03 19:20:48,494 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
461
+ 2024-08-03 19:20:48,494 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
462
+ 2024-08-03 19:20:48,496 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: poll_exit
463
+ 2024-08-03 19:20:49,057 INFO wandb-upload_1:9504 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmp1gtfugn3
464
+ 2024-08-03 19:20:49,437 INFO wandb-upload_0:9504 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp8rydwr53
465
+ 2024-08-03 19:20:50,875 INFO SenderThread:9504 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk4ODAyMA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'versionIndex': 1}}}
466
+ 2024-08-03 19:20:50,876 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
467
+ 2024-08-03 19:20:50,876 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 9
468
+ 2024-08-03 19:20:50,876 INFO SenderThread:9504 [dir_watcher.py:finish():358] shutting down directory watcher
469
+ 2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240803_191815-jdwps0z3/files
470
+ 2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt requirements.txt
471
+ 2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml config.yaml
472
+ 2024-08-03 19:20:51,497 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json wandb-metadata.json
473
+ 2024-08-03 19:20:51,497 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json wandb-summary.json
474
+ 2024-08-03 19:20:51,498 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log output.log
475
+ 2024-08-03 19:20:51,500 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 10
476
+ 2024-08-03 19:20:51,500 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: poll_exit
477
+ 2024-08-03 19:20:51,502 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
478
+ 2024-08-03 19:20:51,502 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 10
479
+ 2024-08-03 19:20:51,502 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
480
+ 2024-08-03 19:20:51,502 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 10
481
+ 2024-08-03 19:20:51,502 INFO SenderThread:9504 [file_pusher.py:finish():172] shutting down file pusher
482
+ 2024-08-03 19:20:51,911 INFO wandb-upload_0:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml
483
+ 2024-08-03 19:20:51,994 INFO wandb-upload_1:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt
484
+ 2024-08-03 19:20:52,073 INFO wandb-upload_2:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
485
+ 2024-08-03 19:20:52,122 INFO wandb-upload_3:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
486
+ 2024-08-03 19:20:52,322 INFO Thread-11 (_thread_body):9504 [sender.py:transition_state():617] send defer: 11
487
+ 2024-08-03 19:20:52,322 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
488
+ 2024-08-03 19:20:52,322 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 11
489
+ 2024-08-03 19:20:52,323 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
490
+ 2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 11
491
+ 2024-08-03 19:20:52,323 INFO SenderThread:9504 [file_pusher.py:join():178] waiting for file pusher
492
+ 2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 12
493
+ 2024-08-03 19:20:52,323 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
494
+ 2024-08-03 19:20:52,323 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 12
495
+ 2024-08-03 19:20:52,323 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
496
+ 2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 12
497
+ 2024-08-03 19:20:52,323 INFO SenderThread:9504 [file_stream.py:finish():595] file stream finish called
498
+ 2024-08-03 19:20:52,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: poll_exit
499
+ 2024-08-03 19:20:52,502 INFO SenderThread:9504 [file_stream.py:finish():599] file stream finish is done
500
+ 2024-08-03 19:20:52,502 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 13
501
+ 2024-08-03 19:20:52,502 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: poll_exit
502
+ 2024-08-03 19:20:52,502 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
503
+ 2024-08-03 19:20:52,502 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 13
504
+ 2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
505
+ 2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 13
506
+ 2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 14
507
+ 2024-08-03 19:20:52,503 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
508
+ 2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send():382] send: final
509
+ 2024-08-03 19:20:52,503 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 14
510
+ 2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send():382] send: footer
511
+ 2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
512
+ 2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 14
513
+ 2024-08-03 19:20:56,504 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
514
+ 2024-08-03 19:21:01,505 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
515
+ 2024-08-03 19:21:06,505 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
516
+ 2024-08-03 19:21:11,506 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
517
+ 2024-08-03 19:21:16,506 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
518
+ 2024-08-03 19:21:21,065 WARNING StreamThr :9504 [internal.py:is_dead():414] Internal process exiting, parent pid 9433 disappeared
519
+ 2024-08-03 19:21:21,065 ERROR StreamThr :9504 [internal.py:wandb_internal():152] Internal process shutdown.
520
+ 2024-08-03 19:21:21,507 INFO SenderThread:9504 [sender.py:finish():1572] shutting down sender
521
+ 2024-08-03 19:21:21,507 INFO SenderThread:9504 [file_pusher.py:finish():172] shutting down file pusher
522
+ 2024-08-03 19:21:21,507 INFO SenderThread:9504 [file_pusher.py:join():178] waiting for file pusher
523
+ 2024-08-03 19:21:21,507 INFO HandlerThread:9504 [handler.py:finish():869] shutting down handler
524
+ 2024-08-03 19:21:21,507 INFO WriterThread:9504 [datastore.py:close():296] close: /project/wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb
wandb/run-20240803_191815-jdwps0z3/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-03 19:18:15,865 INFO MainThread:9433 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Configure stats pid to 9433
3
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_191815-jdwps0z3/logs/debug.log
9
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log
10
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample2_train_2024-08-03-19:18:05', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample2', 'save': '/work/llm_recipes/models/tiny-mistral-sample2', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():616] starting backend
14
+ 2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-03 19:18:15,871 INFO MainThread:9433 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-03 19:18:15,872 INFO MainThread:9433 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-03 19:18:15,877 INFO MainThread:9433 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-03 19:18:15,887 INFO MainThread:9433 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-03 19:18:16,387 INFO MainThread:9433 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-03 19:18:16,470 INFO MainThread:9433 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-03 19:18:16,470 INFO MainThread:9433 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-03 19:18:16,526 INFO MainThread:9433 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-03 19:18:17,990 INFO MainThread:9433 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-03 19:18:17,990 INFO MainThread:9433 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb ADDED
Binary file (107 kB). View file
 
wandb/run-20240803_192355-n3hnzq4n/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample2_train_2024-08-03-19:23:42
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample2
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample2
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 1600
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 40
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample2
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722680635.371313
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 512
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
wandb/run-20240803_192355-n3hnzq4n/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240803_192355-n3hnzq4n/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240803_192355-n3hnzq4n/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T10:23:55.999895",
5
+ "startedAt": "2024-08-03T10:23:55.358106",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "40",
15
+ "--global-batch-size",
16
+ "1600",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample2",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample2",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample2",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample2_train_2024-08-03-19:23:42"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240803_192355-n3hnzq4n/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 8.676288604736328, "training/perplexity": 5862.24904336478, "utils/batch_size": 40, "utils/global_batch_size": 1600, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 2828, "optimizer/lr": 1.933962231874466e-05, "optimizer/variance_l2": 0.014160344368509084, "optimizer/variance_sqrt_l2": 0.9983195251588314, "optimizer/momentum_l2": 0.9847836932741917, "optimizer/weight_l2": 101.93656115447489, "optimizer/variance_l1": 1.0000762939453125, "optimizer/variance_sqrt_l1": 530.25, "optimizer/momentum_l1": 418.75, "optimizer/weight_l1": 333248.0, "optimizer/variance_abs_max": 0.00130462646484375, "optimizer/variance_sqrt_abs_max": 0.0361328125, "optimizer/momentum_abs_max": 0.03662109375, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 2.9201389469999413, "stats/tokens_per_sec": 281082.5152149914, "stats/tokens_per_sec_per_gpu": 281082.5152149914, "stats/tflops": 19.898159556447013, "_timestamp": 1722688970.2608888, "_runtime": 8334.889575719833, "_step": 2828, "evaluation/val_loss": 8.68109130859375, "evaluation/val_ppl": 5890.47216796875, "_wandb": {"runtime": 8336}}
wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240803_192355-n3hnzq4n/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-03 19:23:55,363 INFO MainThread:10080 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Configure stats pid to 10080
3
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_192355-n3hnzq4n/logs/debug.log
9
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log
10
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample2_train_2024-08-03-19:23:42', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample2', 'save': '/work/llm_recipes/models/tiny-mistral-sample2', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():616] starting backend
14
+ 2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-03 19:23:55,369 INFO MainThread:10080 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-03 19:23:55,371 INFO MainThread:10080 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-03 19:23:55,375 INFO MainThread:10080 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-03 19:23:55,403 INFO MainThread:10080 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-03 19:23:55,888 INFO MainThread:10080 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-03 19:23:55,974 INFO MainThread:10080 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-03 19:23:55,974 INFO MainThread:10080 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-03 19:23:56,035 INFO MainThread:10080 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-03 19:23:56,035 INFO MainThread:10080 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-03 19:23:57,141 INFO MainThread:10080 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-03 19:23:57,141 INFO MainThread:10080 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240812_063027-j1htzx7q/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-12-06:30:12
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 3
138
+ save_interval:
139
+ desc: null
140
+ value: 3
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723411827.601845
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Gemma2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: gelu_pytorch_tanh
321
+ hidden_size:
322
+ desc: null
323
+ value: 2304
324
+ model_type:
325
+ desc: null
326
+ value: gemma2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 8
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 26
wandb/run-20240812_063027-j1htzx7q/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T21:30:28.265073",
5
+ "startedAt": "2024-08-11T21:30:27.589443",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "3",
56
+ "--eval-interval",
57
+ "3",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-12-06:30:12"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 06:30:27,603 INFO StreamThr :12721 [internal.py:wandb_internal():86] W&B internal server running at pid: 12721, started at: 2024-08-12 06:30:27.602612
2
+ 2024-08-12 06:30:27,605 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 06:30:27,607 INFO WriterThread:12721 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb
4
+ 2024-08-12 06:30:27,608 DEBUG SenderThread:12721 [sender.py:send():382] send: header
5
+ 2024-08-12 06:30:27,640 DEBUG SenderThread:12721 [sender.py:send():382] send: run
6
+ 2024-08-12 06:30:28,148 INFO SenderThread:12721 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_063027-j1htzx7q/files
7
+ 2024-08-12 06:30:28,148 INFO SenderThread:12721 [sender.py:_start_run_threads():1136] run started: j1htzx7q with start time 1723411827.601845
8
+ 2024-08-12 06:30:28,154 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 06:30:28,154 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 06:30:28,244 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 06:30:28,251 DEBUG HandlerThread:12721 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 06:30:28,251 DEBUG HandlerThread:12721 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 06:30:28,251 INFO HandlerThread:12721 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 06:30:28,251 INFO SystemMonitor:12721 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 06:30:28,251 INFO HandlerThread:12721 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 06:30:28,252 INFO SystemMonitor:12721 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 06:30:28,252 INFO SystemMonitor:12721 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 06:30:28,253 INFO SystemMonitor:12721 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 06:30:28,254 INFO SystemMonitor:12721 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 06:30:28,255 INFO SystemMonitor:12721 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 06:30:28,264 DEBUG HandlerThread:12721 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 06:30:28,267 DEBUG HandlerThread:12721 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T21:30:28.265073', 'startedAt': '2024-08-11T21:30:27.589443', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '3', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-12-06:30:12'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 06:30:28,279 INFO HandlerThread:12721 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 06:30:28,279 INFO HandlerThread:12721 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 06:30:28,281 INFO HandlerThread:12721 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 06:30:28,287 DEBUG SenderThread:12721 [sender.py:send():382] send: files
30
+ 2024-08-12 06:30:28,287 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 06:30:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 06:30:28,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: python_packages
33
+ 2024-08-12 06:30:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-12 06:30:28,298 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-12 06:30:28,299 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 06:30:28,566 DEBUG SenderThread:12721 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 06:30:28,941 INFO wandb-upload_0:12721 [upload_job.py:push():131] Uploaded file /tmp/tmpagb8lhaywandb/h9wwuria-wandb-metadata.json
38
+ 2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt
39
+ 2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
40
+ 2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json
41
+ 2024-08-12 06:30:31,151 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
42
+ 2024-08-12 06:30:32,987 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 06:30:37,988 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-12 06:30:42,988 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-12 06:30:43,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
46
+ 2024-08-12 06:30:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
47
+ 2024-08-12 06:30:43,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
48
+ 2024-08-12 06:30:48,530 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-12 06:30:53,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-12 06:30:58,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
51
+ 2024-08-12 06:30:58,296 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-12 06:30:58,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
53
+ 2024-08-12 06:30:58,553 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-12 06:30:59,169 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
55
+ 2024-08-12 06:31:03,753 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-12 06:31:08,754 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-12 06:31:13,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
58
+ 2024-08-12 06:31:13,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
59
+ 2024-08-12 06:31:13,336 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
60
+ 2024-08-12 06:31:14,505 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-12 06:31:19,506 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-12 06:31:24,507 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-12 06:31:28,255 DEBUG SystemMonitor:12721 [system_monitor.py:_start():172] Starting system metrics aggregation loop
64
+ 2024-08-12 06:31:28,257 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
65
+ 2024-08-12 06:31:28,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
66
+ 2024-08-12 06:31:28,296 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
67
+ 2024-08-12 06:31:28,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
68
+ 2024-08-12 06:31:30,499 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-08-12 06:31:35,500 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-12 06:31:40,561 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-12 06:31:41,196 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
72
+ 2024-08-12 06:31:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
73
+ 2024-08-12 06:31:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
74
+ 2024-08-12 06:31:43,298 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
75
+ 2024-08-12 06:31:46,508 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-12 06:31:51,509 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
77
+ 2024-08-12 06:31:56,510 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
78
+ 2024-08-12 06:31:58,258 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
79
+ 2024-08-12 06:31:58,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
80
+ 2024-08-12 06:31:58,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
81
+ 2024-08-12 06:31:58,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
82
+ 2024-08-12 06:32:02,497 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-12 06:32:07,497 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
84
+ 2024-08-12 06:32:12,498 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
85
+ 2024-08-12 06:32:13,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
86
+ 2024-08-12 06:32:13,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
87
+ 2024-08-12 06:32:13,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
88
+ 2024-08-12 06:32:17,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 06:32:22,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 06:32:27,532 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
91
+ 2024-08-12 06:32:28,259 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
92
+ 2024-08-12 06:32:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
93
+ 2024-08-12 06:32:28,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
94
+ 2024-08-12 06:32:28,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
95
+ 2024-08-12 06:32:33,512 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 06:32:38,513 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-08-12 06:32:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 06:32:43,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 06:32:43,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 06:32:43,534 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 06:32:48,534 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 06:32:53,535 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-12 06:32:58,260 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
104
+ 2024-08-12 06:32:58,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
105
+ 2024-08-12 06:32:58,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
106
+ 2024-08-12 06:32:58,344 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
107
+ 2024-08-12 06:32:59,520 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 06:33:01,246 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
109
+ 2024-08-12 06:33:05,103 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
110
+ 2024-08-12 06:33:10,126 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
111
+ 2024-08-12 06:33:11,252 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
112
+ 2024-08-12 06:33:12,515 DEBUG SenderThread:12721 [sender.py:send():382] send: config
113
+ 2024-08-12 06:33:12,515 DEBUG SenderThread:12721 [sender.py:send():382] send: config
114
+ 2024-08-12 06:33:13,253 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
115
+ 2024-08-12 06:33:13,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
116
+ 2024-08-12 06:33:13,298 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
117
+ 2024-08-12 06:33:13,298 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
118
+ 2024-08-12 06:33:15,255 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
119
+ 2024-08-12 06:33:15,581 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-12 06:33:15,887 DEBUG SenderThread:12721 [sender.py:send():382] send: exit
121
+ 2024-08-12 06:33:15,887 INFO SenderThread:12721 [sender.py:send_exit():589] handling exit code: 1
122
+ 2024-08-12 06:33:15,887 INFO SenderThread:12721 [sender.py:send_exit():591] handling runtime: 167
123
+ 2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
124
+ 2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:send_exit():597] send defer
125
+ 2024-08-12 06:33:15,889 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-12 06:33:15,889 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 0
127
+ 2024-08-12 06:33:15,889 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
128
+ 2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 0
129
+ 2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 1
130
+ 2024-08-12 06:33:15,890 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-08-12 06:33:15,890 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 1
132
+ 2024-08-12 06:33:15,890 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
133
+ 2024-08-12 06:33:15,890 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 1
134
+ 2024-08-12 06:33:15,890 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 2
135
+ 2024-08-12 06:33:15,890 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-12 06:33:15,890 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 2
137
+ 2024-08-12 06:33:15,890 INFO HandlerThread:12721 [system_monitor.py:finish():203] Stopping system monitor
138
+ 2024-08-12 06:33:15,890 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined cpu monitor
139
+ 2024-08-12 06:33:15,890 DEBUG SystemMonitor:12721 [system_monitor.py:_start():179] Finished system metrics aggregation loop
140
+ 2024-08-12 06:33:15,891 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined disk monitor
141
+ 2024-08-12 06:33:15,891 DEBUG SystemMonitor:12721 [system_monitor.py:_start():183] Publishing last batch of metrics
142
+ 2024-08-12 06:33:15,924 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined gpu monitor
143
+ 2024-08-12 06:33:15,925 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined memory monitor
144
+ 2024-08-12 06:33:15,925 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined network monitor
145
+ 2024-08-12 06:33:15,925 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-12 06:33:15,925 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 2
147
+ 2024-08-12 06:33:15,925 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 3
148
+ 2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
149
+ 2024-08-12 06:33:15,926 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-12 06:33:15,926 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 3
151
+ 2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 3
153
+ 2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 4
154
+ 2024-08-12 06:33:15,926 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-08-12 06:33:15,926 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 4
156
+ 2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
157
+ 2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 4
158
+ 2024-08-12 06:33:15,927 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 5
159
+ 2024-08-12 06:33:15,927 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
160
+ 2024-08-12 06:33:15,927 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 5
161
+ 2024-08-12 06:33:15,927 DEBUG SenderThread:12721 [sender.py:send():382] send: summary
162
+ 2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
163
+ 2024-08-12 06:33:15,928 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 5
165
+ 2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 6
166
+ 2024-08-12 06:33:15,928 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
167
+ 2024-08-12 06:33:15,928 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 6
168
+ 2024-08-12 06:33:15,928 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
169
+ 2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 6
170
+ 2024-08-12 06:33:15,931 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-12 06:33:16,132 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 7
172
+ 2024-08-12 06:33:16,132 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
173
+ 2024-08-12 06:33:16,132 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 7
174
+ 2024-08-12 06:33:16,132 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
175
+ 2024-08-12 06:33:16,132 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 7
176
+ 2024-08-12 06:33:16,256 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
177
+ 2024-08-12 06:33:16,256 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json
178
+ 2024-08-12 06:33:16,887 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
179
+ 2024-08-12 06:33:17,100 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 8
180
+ 2024-08-12 06:33:17,100 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
181
+ 2024-08-12 06:33:17,100 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
182
+ 2024-08-12 06:33:17,100 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 8
183
+ 2024-08-12 06:33:17,100 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
184
+ 2024-08-12 06:33:17,100 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 8
185
+ 2024-08-12 06:33:17,100 INFO SenderThread:12721 [job_builder.py:build():296] Attempting to build job artifact
186
+ 2024-08-12 06:33:17,101 INFO SenderThread:12721 [job_builder.py:_get_source_type():426] is repo sourced job
187
+ 2024-08-12 06:33:17,116 INFO SenderThread:12721 [job_builder.py:build():402] adding wandb-job metadata file
188
+ 2024-08-12 06:33:17,124 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 9
189
+ 2024-08-12 06:33:17,125 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
190
+ 2024-08-12 06:33:17,125 DEBUG SenderThread:12721 [sender.py:send():382] send: artifact
191
+ 2024-08-12 06:33:17,125 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 9
192
+ 2024-08-12 06:33:17,257 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
193
+ 2024-08-12 06:33:17,887 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
194
+ 2024-08-12 06:33:18,153 INFO SenderThread:12721 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE0MDAxODM0Nw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
195
+ 2024-08-12 06:33:18,154 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
196
+ 2024-08-12 06:33:18,154 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 9
197
+ 2024-08-12 06:33:18,154 INFO SenderThread:12721 [dir_watcher.py:finish():358] shutting down directory watcher
198
+ 2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_063027-j1htzx7q/files
199
+ 2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt requirements.txt
200
+ 2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml config.yaml
201
+ 2024-08-12 06:33:18,259 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json wandb-metadata.json
202
+ 2024-08-12 06:33:18,260 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json wandb-summary.json
203
+ 2024-08-12 06:33:18,262 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log output.log
204
+ 2024-08-12 06:33:18,262 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 10
205
+ 2024-08-12 06:33:18,262 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
206
+ 2024-08-12 06:33:18,264 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
207
+ 2024-08-12 06:33:18,265 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 10
208
+ 2024-08-12 06:33:18,266 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
209
+ 2024-08-12 06:33:18,266 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 10
210
+ 2024-08-12 06:33:18,266 INFO SenderThread:12721 [file_pusher.py:finish():172] shutting down file pusher
211
+ 2024-08-12 06:33:18,655 INFO wandb-upload_0:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt
212
+ 2024-08-12 06:33:18,745 INFO wandb-upload_1:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
213
+ 2024-08-12 06:33:18,843 INFO wandb-upload_2:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json
214
+ 2024-08-12 06:33:18,858 INFO wandb-upload_3:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
215
+ 2024-08-12 06:33:18,888 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
216
+ 2024-08-12 06:33:18,889 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
217
+ 2024-08-12 06:33:19,058 INFO Thread-11 (_thread_body):12721 [sender.py:transition_state():617] send defer: 11
218
+ 2024-08-12 06:33:19,059 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
219
+ 2024-08-12 06:33:19,059 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 11
220
+ 2024-08-12 06:33:19,059 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
221
+ 2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 11
222
+ 2024-08-12 06:33:19,059 INFO SenderThread:12721 [file_pusher.py:join():178] waiting for file pusher
223
+ 2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 12
224
+ 2024-08-12 06:33:19,059 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
225
+ 2024-08-12 06:33:19,059 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 12
226
+ 2024-08-12 06:33:19,059 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
227
+ 2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 12
228
+ 2024-08-12 06:33:19,059 INFO SenderThread:12721 [file_stream.py:finish():595] file stream finish called
229
+ 2024-08-12 06:33:19,821 INFO SenderThread:12721 [file_stream.py:finish():599] file stream finish is done
230
+ 2024-08-12 06:33:19,821 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 13
231
+ 2024-08-12 06:33:19,822 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
232
+ 2024-08-12 06:33:19,822 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 13
233
+ 2024-08-12 06:33:19,822 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
234
+ 2024-08-12 06:33:19,822 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 13
235
+ 2024-08-12 06:33:19,822 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 14
236
+ 2024-08-12 06:33:19,822 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
237
+ 2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send():382] send: final
238
+ 2024-08-12 06:33:19,823 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 14
239
+ 2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send():382] send: footer
240
+ 2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
241
+ 2024-08-12 06:33:19,823 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 14
242
+ 2024-08-12 06:33:19,823 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
243
+ 2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
244
+ 2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
245
+ 2024-08-12 06:33:19,824 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
246
+ 2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: server_info
247
+ 2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: get_summary
248
+ 2024-08-12 06:33:19,825 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: server_info
249
+ 2024-08-12 06:33:19,826 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: sampled_history
250
+ 2024-08-12 06:33:19,826 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
251
+ 2024-08-12 06:33:19,827 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: job_info
252
+ 2024-08-12 06:33:19,994 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: job_info
253
+ 2024-08-12 06:33:19,994 INFO MainThread:12721 [wandb_run.py:_footer_history_summary_info():3866] rendering history
254
+ 2024-08-12 06:33:19,994 INFO MainThread:12721 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
255
+ 2024-08-12 06:33:19,995 INFO MainThread:12721 [wandb_run.py:_footer_sync_info():3825] logging synced files
256
+ 2024-08-12 06:33:19,995 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: shutdown
257
+ 2024-08-12 06:33:19,995 INFO HandlerThread:12721 [handler.py:finish():869] shutting down handler
258
+ 2024-08-12 06:33:20,827 INFO WriterThread:12721 [datastore.py:close():296] close: /project/wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb
259
+ 2024-08-12 06:33:20,994 INFO SenderThread:12721 [sender.py:finish():1572] shutting down sender
260
+ 2024-08-12 06:33:20,995 INFO SenderThread:12721 [file_pusher.py:finish():172] shutting down file pusher
261
+ 2024-08-12 06:33:20,995 INFO SenderThread:12721 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240812_063027-j1htzx7q/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Configure stats pid to 12650
3
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_063027-j1htzx7q/logs/debug.log
9
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log
10
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-12-06:30:12', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 3, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 06:30:27,596 INFO MainThread:12650 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 06:30:27,596 INFO MainThread:12650 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 06:30:27,600 INFO MainThread:12650 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 06:30:27,601 INFO MainThread:12650 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 06:30:27,606 INFO MainThread:12650 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 06:30:27,632 INFO MainThread:12650 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 06:30:28,153 INFO MainThread:12650 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 06:30:28,237 INFO MainThread:12650 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 06:30:28,237 INFO MainThread:12650 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 06:30:28,298 INFO MainThread:12650 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 06:33:12,514 INFO MainThread:12650 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Gemma2ForCausalLM', 'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26}
29
+ 2024-08-12 06:33:12,515 INFO MainThread:12650 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-12 06:33:20,996 WARNING MsgRouterThr:12650 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb ADDED
Binary file (25 kB). View file
 
wandb/run-20240823_163849-faey1t8u/files/config.yaml ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 4096
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-16:38:35
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 320
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 3
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 106
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724398729.364923
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 4096
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_163849-faey1t8u/files/output.log ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ --> applying fsdp activation checkpointing...
11
+ > datasets target sizes (minimum size):
12
+ train: 2400000
13
+ validation: 2403200
14
+ test: 3200
15
+ > building train, validation, and test datasets for GPT ...
16
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
17
+ warnings.warn(
18
+ Let split = None
19
+ Unable to save the indexes because path_to_cache is None
20
+ Building a BlendedDataset for a single MegatronDataset
21
+ Unable to save the indexes because path_to_cache is None
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ > finished creating GPT datasets ...
25
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
26
+ [rank0]:[2024-08-23 16:38:58,062] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
27
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
30
+ (model): Qwen2Model(
31
+ (embed_tokens): Embedding(151936, 896)
32
+ (layers): ModuleList(
33
+ (0-23): 24 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
36
+ (self_attn): Qwen2FlashAttention2(
37
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
38
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
40
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
41
+ (rotary_emb): Qwen2RotaryEmbedding()
42
+ )
43
+ (mlp): Qwen2MLP(
44
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
46
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): Qwen2RMSNorm()
50
+ (post_attention_layernorm): Qwen2RMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): Qwen2RMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
58
+ )
59
+ )
60
+ model config: Qwen2Config {
61
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
62
+ "architectures": [
63
+ "Qwen2ForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 151643,
67
+ "eos_token_id": 151643,
68
+ "hidden_act": "silu",
69
+ "hidden_size": 896,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4864,
72
+ "label_smoothing": 0.0,
73
+ "max_position_embeddings": 4096,
74
+ "max_window_layers": 24,
75
+ "model_type": "qwen2",
76
+ "num_attention_heads": 14,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": 131072,
82
+ "tie_word_embeddings": true,
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "use_sliding_window": false,
87
+ "vocab_size": 151936
88
+ }
89
+ ------------------------------------------------------------------
90
+ iteration: 41 , TFLOPS: 89.20992749379542, Tokens per sec: 22186.06078570857, Loss: 4.376823425292969
91
+ ------------------------------------------------------------------
92
+ ------------------------------------------------------------------
93
+ iteration: 42 , TFLOPS: 90.73548605717187, Tokens per sec: 22565.459536162463, Loss: 4.388589382171631
94
+ ------------------------------------------------------------------
95
+ ------------------------------------------------------------------
96
+ iteration: 43 , TFLOPS: 90.773373714071, Tokens per sec: 22574.882006089374, Loss: 4.334207057952881
97
+ ------------------------------------------------------------------
98
+ ------------------------------------------------------------------
99
+ iteration: 44 , TFLOPS: 90.78534040748795, Tokens per sec: 22577.85806262267, Loss: 4.347831726074219
100
+ ------------------------------------------------------------------
101
+ ------------------------------------------------------------------
102
+ iteration: 45 , TFLOPS: 90.91842135677658, Tokens per sec: 22610.954626125003, Loss: 4.369765281677246
103
+ ------------------------------------------------------------------
104
+ ------------------------------------------------------------------
105
+ iteration: 46 , TFLOPS: 90.73681564901436, Tokens per sec: 22565.79019897395, Loss: 4.371013164520264
106
+ ------------------------------------------------------------------
107
+ ------------------------------------------------------------------
108
+ iteration: 47 , TFLOPS: 90.85118295674725, Tokens per sec: 22594.23277383689, Loss: 4.347028732299805
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 48 , TFLOPS: 90.81419242163798, Tokens per sec: 22585.033413592068, Loss: 4.319859504699707
112
+ ------------------------------------------------------------------
113
+ Traceback (most recent call last):
114
+ File "/project/examples/finetuning.py", line 13, in <module>
115
+ main()
116
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
117
+ train(
118
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
119
+ loss.backward()
120
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
121
+ torch.autograd.backward(
122
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
123
+ _engine_run_backward(
124
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
125
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
126
+ KeyboardInterrupt
wandb/run-20240823_163849-faey1t8u/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T07:38:49.894670",
5
+ "startedAt": "2024-08-23T07:38:49.352718",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "3",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "320",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-16:38:35"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 4.319859504699707, "training/perplexity": 75.17806538514934, "utils/batch_size": 3, "utils/global_batch_size": 318, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 106, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.050272112510912605, "optimizer/variance_sqrt_l2": 0.9579955556165142, "optimizer/momentum_l2": 0.9571913293356521, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.9176788330078125, "optimizer/variance_sqrt_l1": 4099.5, "optimizer/momentum_l1": 3579.0, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.0419921875, "optimizer/variance_sqrt_abs_max": 0.205078125, "optimizer/momentum_abs_max": 0.2236328125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 57.68625514700034, "stats/tokens_per_sec": 22585.033413592068, "stats/tokens_per_sec_per_gpu": 22585.033413592068, "stats/tflops": 90.81419242163798, "_timestamp": 1724399202.2699971, "_runtime": 472.90507411956787, "_step": 48, "_wandb": {"runtime": 476}}
wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 16:38:49,367 INFO StreamThr :12305 [internal.py:wandb_internal():86] W&B internal server running at pid: 12305, started at: 2024-08-23 16:38:49.366185
2
+ 2024-08-23 16:38:49,368 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 16:38:49,370 INFO WriterThread:12305 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb
4
+ 2024-08-23 16:38:49,371 DEBUG SenderThread:12305 [sender.py:send():382] send: header
5
+ 2024-08-23 16:38:49,405 DEBUG SenderThread:12305 [sender.py:send():382] send: run
6
+ 2024-08-23 16:38:49,800 INFO SenderThread:12305 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_163849-faey1t8u/files
7
+ 2024-08-23 16:38:49,800 INFO SenderThread:12305 [sender.py:_start_run_threads():1136] run started: faey1t8u with start time 1724398729.364923
8
+ 2024-08-23 16:38:49,806 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 16:38:49,806 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 16:38:49,876 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 16:38:49,882 DEBUG HandlerThread:12305 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 16:38:49,882 DEBUG HandlerThread:12305 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 16:38:49,882 INFO HandlerThread:12305 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 16:38:49,882 INFO SystemMonitor:12305 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 16:38:49,882 INFO HandlerThread:12305 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 16:38:49,883 INFO SystemMonitor:12305 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 16:38:49,883 INFO SystemMonitor:12305 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 16:38:49,884 INFO SystemMonitor:12305 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 16:38:49,885 INFO SystemMonitor:12305 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 16:38:49,886 INFO SystemMonitor:12305 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 16:38:49,894 DEBUG HandlerThread:12305 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 16:38:49,896 DEBUG HandlerThread:12305 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 16:38:49,908 DEBUG HandlerThread:12305 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 16:38:49,909 DEBUG HandlerThread:12305 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 16:38:49,909 DEBUG HandlerThread:12305 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:38:49.894670', 'startedAt': '2024-08-23T07:38:49.352718', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '131072', '--micro-batch-size', '3', '--valid_micro_batch_size', '1', '--global-batch-size', '320', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:38:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
26
+ 2024-08-23 16:38:49,909 INFO HandlerThread:12305 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 16:38:49,909 INFO HandlerThread:12305 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 16:38:49,910 INFO HandlerThread:12305 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 16:38:49,915 DEBUG SenderThread:12305 [sender.py:send():382] send: files
30
+ 2024-08-23 16:38:49,916 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 16:38:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 16:38:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
33
+ 2024-08-23 16:38:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-23 16:38:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 16:38:49,930 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 16:38:50,183 DEBUG SenderThread:12305 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 16:38:50,520 INFO wandb-upload_0:12305 [upload_job.py:push():131] Uploaded file /tmp/tmpljn_2vd6wandb/tfd8n6zw-wandb-metadata.json
38
+ 2024-08-23 16:38:50,802 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json
39
+ 2024-08-23 16:38:50,802 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt
40
+ 2024-08-23 16:38:50,803 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
41
+ 2024-08-23 16:38:52,803 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
42
+ 2024-08-23 16:38:54,804 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
43
+ 2024-08-23 16:38:54,958 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-23 16:38:56,805 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
45
+ 2024-08-23 16:38:58,352 DEBUG SenderThread:12305 [sender.py:send():382] send: config
46
+ 2024-08-23 16:38:58,353 DEBUG SenderThread:12305 [sender.py:send():382] send: config
47
+ 2024-08-23 16:38:58,807 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
48
+ 2024-08-23 16:39:00,353 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-23 16:39:00,808 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
50
+ 2024-08-23 16:39:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
51
+ 2024-08-23 16:39:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-23 16:39:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-23 16:39:06,113 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-23 16:39:11,113 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-23 16:39:16,114 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-23 16:39:19,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
57
+ 2024-08-23 16:39:19,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
58
+ 2024-08-23 16:39:19,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
59
+ 2024-08-23 16:39:21,162 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
60
+ 2024-08-23 16:39:21,819 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml
61
+ 2024-08-23 16:39:26,345 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-23 16:39:31,346 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-23 16:39:34,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
64
+ 2024-08-23 16:39:34,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
65
+ 2024-08-23 16:39:34,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
66
+ 2024-08-23 16:39:37,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-23 16:39:42,126 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-23 16:39:47,127 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-08-23 16:39:49,886 DEBUG SystemMonitor:12305 [system_monitor.py:_start():172] Starting system metrics aggregation loop
70
+ 2024-08-23 16:39:49,888 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
71
+ 2024-08-23 16:39:49,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
72
+ 2024-08-23 16:39:49,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
73
+ 2024-08-23 16:39:49,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
74
+ 2024-08-23 16:39:53,111 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-23 16:39:58,112 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-23 16:39:58,398 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
77
+ 2024-08-23 16:40:00,838 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
78
+ 2024-08-23 16:40:03,440 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
79
+ 2024-08-23 16:40:04,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
80
+ 2024-08-23 16:40:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
81
+ 2024-08-23 16:40:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
82
+ 2024-08-23 16:40:09,115 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-23 16:40:14,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
84
+ 2024-08-23 16:40:19,117 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
85
+ 2024-08-23 16:40:19,889 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
86
+ 2024-08-23 16:40:19,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
87
+ 2024-08-23 16:40:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
88
+ 2024-08-23 16:40:19,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
89
+ 2024-08-23 16:40:25,115 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-23 16:40:30,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
91
+ 2024-08-23 16:40:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
92
+ 2024-08-23 16:40:34,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
93
+ 2024-08-23 16:40:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
94
+ 2024-08-23 16:40:35,173 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-23 16:40:40,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-23 16:40:45,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-08-23 16:40:49,890 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
98
+ 2024-08-23 16:40:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
99
+ 2024-08-23 16:40:49,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
100
+ 2024-08-23 16:40:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
101
+ 2024-08-23 16:40:51,147 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-23 16:40:56,136 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
103
+ 2024-08-23 16:40:56,139 DEBUG SenderThread:12305 [sender.py:send():382] send: history
104
+ 2024-08-23 16:40:56,139 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
105
+ 2024-08-23 16:40:56,154 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
106
+ 2024-08-23 16:40:56,154 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-23 16:40:56,867 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
108
+ 2024-08-23 16:40:58,868 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
109
+ 2024-08-23 16:41:01,180 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
110
+ 2024-08-23 16:41:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
111
+ 2024-08-23 16:41:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
112
+ 2024-08-23 16:41:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
113
+ 2024-08-23 16:41:07,173 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-23 16:41:12,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
115
+ 2024-08-23 16:41:17,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
116
+ 2024-08-23 16:41:19,891 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
117
+ 2024-08-23 16:41:19,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
118
+ 2024-08-23 16:41:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
119
+ 2024-08-23 16:41:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
120
+ 2024-08-23 16:41:23,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-23 16:41:28,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
122
+ 2024-08-23 16:41:33,139 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
123
+ 2024-08-23 16:41:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
124
+ 2024-08-23 16:41:34,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
125
+ 2024-08-23 16:41:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
126
+ 2024-08-23 16:41:39,118 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
127
+ 2024-08-23 16:41:44,119 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
128
+ 2024-08-23 16:41:49,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
129
+ 2024-08-23 16:41:49,892 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
130
+ 2024-08-23 16:41:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
131
+ 2024-08-23 16:41:49,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
132
+ 2024-08-23 16:41:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
133
+ 2024-08-23 16:41:53,851 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
134
+ 2024-08-23 16:41:53,852 DEBUG SenderThread:12305 [sender.py:send():382] send: history
135
+ 2024-08-23 16:41:53,852 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
136
+ 2024-08-23 16:41:53,854 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
137
+ 2024-08-23 16:41:53,896 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
138
+ 2024-08-23 16:41:54,892 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
139
+ 2024-08-23 16:41:54,897 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
140
+ 2024-08-23 16:41:59,892 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
141
+ 2024-08-23 16:42:04,893 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
142
+ 2024-08-23 16:42:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
143
+ 2024-08-23 16:42:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
144
+ 2024-08-23 16:42:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
145
+ 2024-08-23 16:42:10,105 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
146
+ 2024-08-23 16:42:15,106 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-23 16:42:19,893 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
148
+ 2024-08-23 16:42:19,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
149
+ 2024-08-23 16:42:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
150
+ 2024-08-23 16:42:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
151
+ 2024-08-23 16:42:20,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
152
+ 2024-08-23 16:42:25,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
153
+ 2024-08-23 16:42:30,147 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-08-23 16:42:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
155
+ 2024-08-23 16:42:34,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
156
+ 2024-08-23 16:42:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
157
+ 2024-08-23 16:42:36,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
158
+ 2024-08-23 16:42:41,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
159
+ 2024-08-23 16:42:46,139 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
160
+ 2024-08-23 16:42:49,894 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
161
+ 2024-08-23 16:42:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
162
+ 2024-08-23 16:42:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
163
+ 2024-08-23 16:42:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
164
+ 2024-08-23 16:42:51,557 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
165
+ 2024-08-23 16:42:51,559 DEBUG SenderThread:12305 [sender.py:send():382] send: history
166
+ 2024-08-23 16:42:51,559 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
167
+ 2024-08-23 16:42:51,560 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
168
+ 2024-08-23 16:42:51,561 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
169
+ 2024-08-23 16:42:51,926 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
170
+ 2024-08-23 16:42:52,927 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
171
+ 2024-08-23 16:42:56,600 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-23 16:43:01,601 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
173
+ 2024-08-23 16:43:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
174
+ 2024-08-23 16:43:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
175
+ 2024-08-23 16:43:04,930 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-23 16:43:07,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-08-23 16:43:12,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-23 16:43:17,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
179
+ 2024-08-23 16:43:19,895 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
180
+ 2024-08-23 16:43:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
181
+ 2024-08-23 16:43:19,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
182
+ 2024-08-23 16:43:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
183
+ 2024-08-23 16:43:23,107 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-08-23 16:43:28,108 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
185
+ 2024-08-23 16:43:33,108 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
186
+ 2024-08-23 16:43:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
187
+ 2024-08-23 16:43:34,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
188
+ 2024-08-23 16:43:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
189
+ 2024-08-23 16:43:38,202 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
190
+ 2024-08-23 16:43:43,203 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
191
+ 2024-08-23 16:43:48,203 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
192
+ 2024-08-23 16:43:49,180 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
193
+ 2024-08-23 16:43:49,181 DEBUG SenderThread:12305 [sender.py:send():382] send: history
194
+ 2024-08-23 16:43:49,182 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
195
+ 2024-08-23 16:43:49,184 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
196
+ 2024-08-23 16:43:49,896 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
197
+ 2024-08-23 16:43:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
198
+ 2024-08-23 16:43:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
199
+ 2024-08-23 16:43:49,930 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
200
+ 2024-08-23 16:43:49,954 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
201
+ 2024-08-23 16:43:50,955 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
202
+ 2024-08-23 16:43:54,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
203
+ 2024-08-23 16:43:59,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
204
+ 2024-08-23 16:44:04,117 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
205
+ 2024-08-23 16:44:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
206
+ 2024-08-23 16:44:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
207
+ 2024-08-23 16:44:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
208
+ 2024-08-23 16:44:09,123 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-23 16:44:14,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-23 16:44:19,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
211
+ 2024-08-23 16:44:19,897 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
212
+ 2024-08-23 16:44:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
213
+ 2024-08-23 16:44:19,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
214
+ 2024-08-23 16:44:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
215
+ 2024-08-23 16:44:25,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
216
+ 2024-08-23 16:44:30,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
217
+ 2024-08-23 16:44:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
218
+ 2024-08-23 16:44:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
219
+ 2024-08-23 16:44:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
220
+ 2024-08-23 16:44:35,144 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
221
+ 2024-08-23 16:44:40,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
222
+ 2024-08-23 16:44:45,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-23 16:44:46,917 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
224
+ 2024-08-23 16:44:46,920 DEBUG SenderThread:12305 [sender.py:send():382] send: history
225
+ 2024-08-23 16:44:46,921 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
226
+ 2024-08-23 16:44:46,922 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
227
+ 2024-08-23 16:44:46,983 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
228
+ 2024-08-23 16:44:48,984 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
229
+ 2024-08-23 16:44:49,898 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
230
+ 2024-08-23 16:44:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
231
+ 2024-08-23 16:44:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
232
+ 2024-08-23 16:44:49,931 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
233
+ 2024-08-23 16:44:50,159 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
234
+ 2024-08-23 16:44:55,160 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
235
+ 2024-08-23 16:45:00,160 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
236
+ 2024-08-23 16:45:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
237
+ 2024-08-23 16:45:04,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
238
+ 2024-08-23 16:45:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
239
+ 2024-08-23 16:45:06,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
240
+ 2024-08-23 16:45:11,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-08-23 16:45:16,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
242
+ 2024-08-23 16:45:19,899 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
243
+ 2024-08-23 16:45:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
244
+ 2024-08-23 16:45:19,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
245
+ 2024-08-23 16:45:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
246
+ 2024-08-23 16:45:21,199 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
247
+ 2024-08-23 16:45:26,199 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
248
+ 2024-08-23 16:45:31,200 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
249
+ 2024-08-23 16:45:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
250
+ 2024-08-23 16:45:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
251
+ 2024-08-23 16:45:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
252
+ 2024-08-23 16:45:37,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
253
+ 2024-08-23 16:45:42,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
254
+ 2024-08-23 16:45:44,582 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
255
+ 2024-08-23 16:45:44,584 DEBUG SenderThread:12305 [sender.py:send():382] send: history
256
+ 2024-08-23 16:45:44,584 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
257
+ 2024-08-23 16:45:44,585 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
258
+ 2024-08-23 16:45:45,011 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
259
+ 2024-08-23 16:45:47,011 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
260
+ 2024-08-23 16:45:47,624 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
261
+ 2024-08-23 16:45:49,899 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
262
+ 2024-08-23 16:45:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
263
+ 2024-08-23 16:45:49,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
264
+ 2024-08-23 16:45:49,931 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
265
+ 2024-08-23 16:45:53,153 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
266
+ 2024-08-23 16:45:58,153 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
267
+ 2024-08-23 16:46:03,154 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
268
+ 2024-08-23 16:46:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
269
+ 2024-08-23 16:46:04,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
270
+ 2024-08-23 16:46:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
271
+ 2024-08-23 16:46:09,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
272
+ 2024-08-23 16:46:14,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
273
+ 2024-08-23 16:46:19,101 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
274
+ 2024-08-23 16:46:19,900 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
275
+ 2024-08-23 16:46:19,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
276
+ 2024-08-23 16:46:19,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
277
+ 2024-08-23 16:46:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
278
+ 2024-08-23 16:46:24,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
279
+ 2024-08-23 16:46:29,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
280
+ 2024-08-23 16:46:34,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
281
+ 2024-08-23 16:46:34,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
282
+ 2024-08-23 16:46:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
283
+ 2024-08-23 16:46:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
284
+ 2024-08-23 16:46:40,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
285
+ 2024-08-23 16:46:42,271 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
286
+ 2024-08-23 16:46:42,273 DEBUG SenderThread:12305 [sender.py:send():382] send: history
287
+ 2024-08-23 16:46:42,273 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
288
+ 2024-08-23 16:46:42,274 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
289
+ 2024-08-23 16:46:43,038 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
290
+ 2024-08-23 16:46:45,038 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
291
+ 2024-08-23 16:46:45,316 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
292
+ 2024-08-23 16:46:46,081 DEBUG SenderThread:12305 [sender.py:send():382] send: exit
293
+ 2024-08-23 16:46:46,081 INFO SenderThread:12305 [sender.py:send_exit():589] handling exit code: 255
294
+ 2024-08-23 16:46:46,081 INFO SenderThread:12305 [sender.py:send_exit():591] handling runtime: 476
295
+ 2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
296
+ 2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:send_exit():597] send defer
297
+ 2024-08-23 16:46:46,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
298
+ 2024-08-23 16:46:46,083 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 0
299
+ 2024-08-23 16:46:46,083 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
300
+ 2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 0
301
+ 2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 1
302
+ 2024-08-23 16:46:46,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
303
+ 2024-08-23 16:46:46,083 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 1
304
+ 2024-08-23 16:46:46,084 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
305
+ 2024-08-23 16:46:46,084 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 1
306
+ 2024-08-23 16:46:46,084 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 2
307
+ 2024-08-23 16:46:46,084 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
308
+ 2024-08-23 16:46:46,084 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 2
309
+ 2024-08-23 16:46:46,084 INFO HandlerThread:12305 [system_monitor.py:finish():203] Stopping system monitor
310
+ 2024-08-23 16:46:46,084 DEBUG SystemMonitor:12305 [system_monitor.py:_start():179] Finished system metrics aggregation loop
311
+ 2024-08-23 16:46:46,084 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined cpu monitor
312
+ 2024-08-23 16:46:46,084 DEBUG SystemMonitor:12305 [system_monitor.py:_start():183] Publishing last batch of metrics
313
+ 2024-08-23 16:46:46,084 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined disk monitor
314
+ 2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined gpu monitor
315
+ 2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined memory monitor
316
+ 2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined network monitor
317
+ 2024-08-23 16:46:46,118 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
318
+ 2024-08-23 16:46:46,119 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 2
319
+ 2024-08-23 16:46:46,119 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 3
320
+ 2024-08-23 16:46:46,119 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
321
+ 2024-08-23 16:46:46,119 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
322
+ 2024-08-23 16:46:46,119 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 3
323
+ 2024-08-23 16:46:46,120 DEBUG SenderThread:12305 [sender.py:send():382] send: history
324
+ 2024-08-23 16:46:46,121 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
325
+ 2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
326
+ 2024-08-23 16:46:46,122 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
327
+ 2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 3
328
+ 2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 4
329
+ 2024-08-23 16:46:46,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
330
+ 2024-08-23 16:46:46,122 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 4
331
+ 2024-08-23 16:46:46,122 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
332
+ 2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 4
333
+ 2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 5
334
+ 2024-08-23 16:46:46,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
335
+ 2024-08-23 16:46:46,122 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 5
336
+ 2024-08-23 16:46:46,123 DEBUG SenderThread:12305 [sender.py:send():382] send: summary
337
+ 2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
338
+ 2024-08-23 16:46:46,124 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
339
+ 2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 5
340
+ 2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 6
341
+ 2024-08-23 16:46:46,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
342
+ 2024-08-23 16:46:46,124 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 6
343
+ 2024-08-23 16:46:46,125 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
344
+ 2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 6
345
+ 2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 7
346
+ 2024-08-23 16:46:46,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
347
+ 2024-08-23 16:46:46,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
348
+ 2024-08-23 16:46:46,125 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 7
349
+ 2024-08-23 16:46:46,125 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
350
+ 2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 7
351
+ 2024-08-23 16:46:47,040 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
352
+ 2024-08-23 16:46:47,081 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
353
+ 2024-08-23 16:46:47,096 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 8
354
+ 2024-08-23 16:46:47,096 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
355
+ 2024-08-23 16:46:47,096 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
356
+ 2024-08-23 16:46:47,097 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 8
357
+ 2024-08-23 16:46:47,097 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
358
+ 2024-08-23 16:46:47,097 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 8
359
+ 2024-08-23 16:46:47,097 INFO SenderThread:12305 [job_builder.py:build():296] Attempting to build job artifact
360
+ 2024-08-23 16:46:47,098 INFO SenderThread:12305 [job_builder.py:_get_source_type():426] is repo sourced job
361
+ 2024-08-23 16:46:47,113 INFO SenderThread:12305 [job_builder.py:build():402] adding wandb-job metadata file
362
+ 2024-08-23 16:46:47,121 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 9
363
+ 2024-08-23 16:46:47,122 DEBUG SenderThread:12305 [sender.py:send():382] send: artifact
364
+ 2024-08-23 16:46:47,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
365
+ 2024-08-23 16:46:47,123 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 9
366
+ 2024-08-23 16:46:48,001 INFO SenderThread:12305 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'versionIndex': 3}}}
367
+ 2024-08-23 16:46:48,001 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
368
+ 2024-08-23 16:46:48,001 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 9
369
+ 2024-08-23 16:46:48,001 INFO SenderThread:12305 [dir_watcher.py:finish():358] shutting down directory watcher
370
+ 2024-08-23 16:46:48,041 INFO SenderThread:12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
371
+ 2024-08-23 16:46:48,041 INFO SenderThread:12305 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_163849-faey1t8u/files
372
+ 2024-08-23 16:46:48,042 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt requirements.txt
373
+ 2024-08-23 16:46:48,042 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml config.yaml
374
+ 2024-08-23 16:46:48,043 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json wandb-metadata.json
375
+ 2024-08-23 16:46:48,043 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json wandb-summary.json
376
+ 2024-08-23 16:46:48,045 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/output.log output.log
377
+ 2024-08-23 16:46:48,046 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 10
378
+ 2024-08-23 16:46:48,047 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
379
+ 2024-08-23 16:46:48,048 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 10
380
+ 2024-08-23 16:46:48,048 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
381
+ 2024-08-23 16:46:48,048 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 10
382
+ 2024-08-23 16:46:48,048 INFO SenderThread:12305 [file_pusher.py:finish():172] shutting down file pusher
383
+ 2024-08-23 16:46:48,082 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
384
+ 2024-08-23 16:46:48,082 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
385
+ 2024-08-23 16:46:49,082 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
386
+ 2024-08-23 16:46:49,082 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
387
+ 2024-08-23 16:46:50,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
388
+ 2024-08-23 16:46:50,083 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
389
+ 2024-08-23 16:46:50,119 INFO wandb-upload_2:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
390
+ 2024-08-23 16:46:50,148 INFO wandb-upload_3:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/output.log
391
+ 2024-08-23 16:46:50,166 INFO wandb-upload_0:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt
392
+ 2024-08-23 16:46:50,204 INFO wandb-upload_1:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml
393
+ 2024-08-23 16:46:50,405 INFO Thread-11 (_thread_body):12305 [sender.py:transition_state():617] send defer: 11
394
+ 2024-08-23 16:46:50,405 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
395
+ 2024-08-23 16:46:50,405 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 11
396
+ 2024-08-23 16:46:50,405 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
397
+ 2024-08-23 16:46:50,405 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 11
398
+ 2024-08-23 16:46:50,405 INFO SenderThread:12305 [file_pusher.py:join():178] waiting for file pusher
399
+ 2024-08-23 16:46:50,405 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 12
400
+ 2024-08-23 16:46:50,405 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
401
+ 2024-08-23 16:46:50,406 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 12
402
+ 2024-08-23 16:46:50,406 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
403
+ 2024-08-23 16:46:50,406 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 12
404
+ 2024-08-23 16:46:50,406 INFO SenderThread:12305 [file_stream.py:finish():595] file stream finish called
405
+ 2024-08-23 16:46:51,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
406
+ 2024-08-23 16:46:51,097 INFO SenderThread:12305 [file_stream.py:finish():599] file stream finish is done
407
+ 2024-08-23 16:46:51,097 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 13
408
+ 2024-08-23 16:46:51,097 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
409
+ 2024-08-23 16:46:51,097 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
410
+ 2024-08-23 16:46:51,098 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 13
411
+ 2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
412
+ 2024-08-23 16:46:51,098 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 13
413
+ 2024-08-23 16:46:51,098 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 14
414
+ 2024-08-23 16:46:51,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
415
+ 2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send():382] send: final
416
+ 2024-08-23 16:46:51,098 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 14
417
+ 2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send():382] send: footer
418
+ 2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
419
+ 2024-08-23 16:46:51,099 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 14
420
+ 2024-08-23 16:46:51,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
421
+ 2024-08-23 16:46:51,099 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
422
+ 2024-08-23 16:46:51,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
423
+ 2024-08-23 16:46:51,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: server_info
424
+ 2024-08-23 16:46:51,100 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
425
+ 2024-08-23 16:46:51,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: get_summary
426
+ 2024-08-23 16:46:51,100 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: server_info
427
+ 2024-08-23 16:46:51,101 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: sampled_history
428
+ 2024-08-23 16:46:51,103 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
429
+ 2024-08-23 16:46:51,103 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: job_info
430
+ 2024-08-23 16:46:51,261 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: job_info
431
+ 2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_history_summary_info():3866] rendering history
432
+ 2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
433
+ 2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_sync_info():3825] logging synced files
434
+ 2024-08-23 16:46:51,263 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: shutdown
435
+ 2024-08-23 16:46:51,263 INFO HandlerThread:12305 [handler.py:finish():869] shutting down handler
436
+ 2024-08-23 16:46:52,103 INFO WriterThread:12305 [datastore.py:close():296] close: /project/wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb
437
+ 2024-08-23 16:46:52,262 INFO SenderThread:12305 [sender.py:finish():1572] shutting down sender
438
+ 2024-08-23 16:46:52,262 INFO SenderThread:12305 [file_pusher.py:finish():172] shutting down file pusher
439
+ 2024-08-23 16:46:52,262 INFO SenderThread:12305 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240823_163849-faey1t8u/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Configure stats pid to 12234
3
+ 2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_163849-faey1t8u/logs/debug.log
9
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log
10
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:38:35', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 3, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 106}
13
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():616] starting backend
14
+ 2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-23 16:38:49,364 INFO MainThread:12234 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-23 16:38:49,364 INFO MainThread:12234 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-23 16:38:49,369 INFO MainThread:12234 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-23 16:38:49,401 INFO MainThread:12234 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-23 16:38:49,805 INFO MainThread:12234 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-23 16:38:49,830 INFO MainThread:12234 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-23 16:38:49,830 INFO MainThread:12234 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-23 16:38:49,927 INFO MainThread:12234 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-23 16:38:49,927 INFO MainThread:12234 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-23 16:38:58,351 INFO MainThread:12234 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-23 16:38:58,352 INFO MainThread:12234 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-23 16:46:52,263 WARNING MsgRouterThr:12234 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb ADDED
Binary file (49.1 kB). View file
 
wandb/run-20240823_202540-om09pls8/files/config.yaml ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 1024
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-20:25:00
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 3
140
+ save_interval:
141
+ desc: null
142
+ value: 500
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 16000
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 16000
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 612
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 17
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 10
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 4
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 9
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724412340.7504
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 1024
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_202540-om09pls8/files/output.log ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ --> applying fsdp activation checkpointing...
11
+ > datasets target sizes (minimum size):
12
+ train: 9792000
13
+ validation: 32644080
14
+ test: 6120
15
+ > building train, validation, and test datasets for GPT ...
16
+ Let split = None
17
+ Unable to save the indexes because path_to_cache is None
18
+ Building a BlendedDataset for a single MegatronDataset
19
+ Unable to save the indexes because path_to_cache is None
20
+ > finished creating GPT datasets ...
21
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/optimizer.pt
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ [rank0]:[2024-08-23 20:26:02,460] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
25
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/optimizer.pt
26
+ model info: FullyShardedDataParallel(
27
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
28
+ (model): Qwen2Model(
29
+ (embed_tokens): Embedding(151936, 896)
30
+ (layers): ModuleList(
31
+ (0-23): 24 x FullyShardedDataParallel(
32
+ (_fsdp_wrapped_module): CheckpointWrapper(
33
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
34
+ (self_attn): Qwen2FlashAttention2(
35
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
36
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
37
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
39
+ (rotary_emb): Qwen2RotaryEmbedding()
40
+ )
41
+ (mlp): Qwen2MLP(
42
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
43
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
45
+ (act_fn): SiLU()
46
+ )
47
+ (input_layernorm): Qwen2RMSNorm()
48
+ (post_attention_layernorm): Qwen2RMSNorm()
49
+ )
50
+ )
51
+ )
52
+ )
53
+ (norm): Qwen2RMSNorm()
54
+ )
55
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
56
+ )
57
+ )
58
+ model config: Qwen2Config {
59
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
60
+ "architectures": [
61
+ "Qwen2ForCausalLM"
62
+ ],
63
+ "attention_dropout": 0.0,
64
+ "bos_token_id": 151643,
65
+ "eos_token_id": 151643,
66
+ "hidden_act": "silu",
67
+ "hidden_size": 896,
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 4864,
70
+ "label_smoothing": 0.0,
71
+ "max_position_embeddings": 1024,
72
+ "max_window_layers": 24,
73
+ "model_type": "qwen2",
74
+ "num_attention_heads": 14,
75
+ "num_hidden_layers": 24,
76
+ "num_key_value_heads": 2,
77
+ "rms_norm_eps": 1e-06,
78
+ "rope_theta": 1000000.0,
79
+ "sliding_window": 131072,
80
+ "tie_word_embeddings": true,
81
+ "torch_dtype": "bfloat16",
82
+ "transformers_version": "4.43.3",
83
+ "use_cache": false,
84
+ "use_sliding_window": false,
85
+ "vocab_size": 151936
86
+ }
87
+ ------------------------------------------------------------------
88
+ iteration: 101 , TFLOPS: 23.617124223738323, Tokens per sec: 29262.917733714043, Loss: 3.7116615772247314
89
+ ------------------------------------------------------------------
90
+ ------------------------------------------------------------------
91
+ iteration: 102 , TFLOPS: 72.97505903558803, Tokens per sec: 90420.11757828314, Loss: 3.7358791828155518
92
+ ------------------------------------------------------------------
93
+ eval ppl=30.41041374206543, eval loss=3.414785146713257
94
+ ------------------------------------------------------------------
95
+ iteration: 103 , TFLOPS: 65.5275483737983, Tokens per sec: 81192.2416628126, Loss: 3.757955551147461
96
+ ------------------------------------------------------------------
97
+ ------------------------------------------------------------------
98
+ iteration: 104 , TFLOPS: 73.41964709398604, Tokens per sec: 90970.9866703471, Loss: 3.730485439300537
99
+ ------------------------------------------------------------------
100
+ ------------------------------------------------------------------
101
+ iteration: 105 , TFLOPS: 70.30393549004248, Tokens per sec: 87110.44838107748, Loss: 3.7091140747070312
102
+ ------------------------------------------------------------------
103
+ eval ppl=30.61298179626465, eval loss=3.421424150466919
104
+ ------------------------------------------------------------------
105
+ iteration: 106 , TFLOPS: 67.02992418712539, Tokens per sec: 83053.76804570397, Loss: 3.732792377471924
106
+ ------------------------------------------------------------------
107
+ ------------------------------------------------------------------
108
+ iteration: 107 , TFLOPS: 72.30676298983876, Tokens per sec: 89592.06196815638, Loss: 3.7457761764526367
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 108 , TFLOPS: 73.24548732883933, Tokens per sec: 90755.19312868938, Loss: 3.7291133403778076
112
+ ------------------------------------------------------------------
113
+ eval ppl=31.11376953125, eval loss=3.437650442123413
114
+ ------------------------------------------------------------------
115
+ iteration: 109 , TFLOPS: 66.08722155273428, Tokens per sec: 81885.7075580594, Loss: 3.71726131439209
116
+ ------------------------------------------------------------------
117
+ ------------------------------------------------------------------
118
+ iteration: 110 , TFLOPS: 73.33268637131981, Tokens per sec: 90863.23754520644, Loss: 3.7274892330169678
119
+ ------------------------------------------------------------------
120
+ Traceback (most recent call last):
121
+ File "/project/examples/finetuning.py", line 13, in <module>
122
+ main()
123
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
124
+ train(
125
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
126
+ loss.backward()
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
128
+ torch.autograd.backward(
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
130
+ _engine_run_backward(
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
132
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
133
+ KeyboardInterrupt
wandb/run-20240823_202540-om09pls8/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T11:25:41.454442",
5
+ "startedAt": "2024-08-23T11:25:40.735970",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1024",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "17",
15
+ "--valid_micro_batch_size",
16
+ "10",
17
+ "--global-batch-size",
18
+ "612",
19
+ "--train-iters",
20
+ "16000",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "16000",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "500",
60
+ "--eval-interval",
61
+ "3",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-20:25:00"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 72,
105
+ "cpu_count_logical": 72,
106
+ "cpu_freq": {
107
+ "current": 2400.038999999999,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2400.039,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2400.039,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 2400.039,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 2400.039,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 2400.039,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2400.039,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 2400.039,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2400.039,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 2400.039,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2400.039,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2400.039,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2400.039,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2400.039,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2400.039,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2400.039,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2400.039,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 2400.039,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2400.039,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2400.039,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 2400.039,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2400.039,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2400.039,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 2400.039,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2400.039,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2400.039,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2400.039,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2400.039,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 2400.039,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2400.039,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2400.039,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 2400.039,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2400.039,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2400.039,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2400.039,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 2400.039,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2400.039,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2400.039,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2400.039,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 2400.039,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2400.039,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2400.039,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2400.039,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2400.039,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2400.039,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 2400.039,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2400.039,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2400.039,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 2400.039,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2400.039,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2400.039,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2400.039,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2400.039,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2400.039,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2400.039,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ }
472
+ ],
473
+ "disk": {
474
+ "/": {
475
+ "total": 0.0625,
476
+ "used": 1.1444091796875e-05
477
+ }
478
+ },
479
+ "gpu": "NVIDIA A100-SXM4-40GB",
480
+ "gpu_count": 4,
481
+ "gpu_devices": [
482
+ {
483
+ "name": "NVIDIA A100-SXM4-40GB",
484
+ "memory_total": 42949672960
485
+ },
486
+ {
487
+ "name": "NVIDIA A100-SXM4-40GB",
488
+ "memory_total": 42949672960
489
+ },
490
+ {
491
+ "name": "NVIDIA A100-SXM4-40GB",
492
+ "memory_total": 42949672960
493
+ },
494
+ {
495
+ "name": "NVIDIA A100-SXM4-40GB",
496
+ "memory_total": 42949672960
497
+ }
498
+ ],
499
+ "memory": {
500
+ "total": 226.66352462768555
501
+ }
502
+ }
wandb/run-20240823_202540-om09pls8/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 3.7274892330169678, "training/perplexity": 41.57459289701456, "utils/batch_size": 17, "utils/global_batch_size": 612, "utils/seq_len": 1025, "utils/gradient_accumulation_steps": 9, "utils/iteration": 110, "optimizer/lr": 5.18e-06, "optimizer/variance_l2": 0.04658828859035772, "optimizer/variance_sqrt_l2": 0.8728927373830674, "optimizer/momentum_l2": 0.8215464439288661, "optimizer/weight_l2": 640.8711356427281, "optimizer/variance_l1": 0.7605819702148438, "optimizer/variance_sqrt_l1": 1761.25, "optimizer/momentum_l1": 1269.75, "optimizer/weight_l1": 1809664.0, "optimizer/variance_abs_max": 0.04248046875, "optimizer/variance_sqrt_abs_max": 0.2060546875, "optimizer/momentum_abs_max": 0.18359375, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 6.903782178000256, "stats/tokens_per_sec": 90863.23754520644, "stats/tokens_per_sec_per_gpu": 22715.80938630161, "stats/tflops": 73.33268637131981, "_timestamp": 1724412471.451136, "_runtime": 130.7007360458374, "_step": 110, "evaluation/val_loss": 3.437650442123413, "evaluation/val_ppl": 31.11376953125, "_wandb": {"runtime": 131}}
wandb/run-20240823_202540-om09pls8/logs/debug-internal.log ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 20:25:40,751 INFO StreamThr :13176 [internal.py:wandb_internal():86] W&B internal server running at pid: 13176, started at: 2024-08-23 20:25:40.750453
2
+ 2024-08-23 20:25:40,752 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 20:25:40,755 INFO WriterThread:13176 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb
4
+ 2024-08-23 20:25:40,756 DEBUG SenderThread:13176 [sender.py:send():382] send: header
5
+ 2024-08-23 20:25:40,869 DEBUG SenderThread:13176 [sender.py:send():382] send: run
6
+ 2024-08-23 20:25:41,336 INFO SenderThread:13176 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_202540-om09pls8/files
7
+ 2024-08-23 20:25:41,336 INFO SenderThread:13176 [sender.py:_start_run_threads():1136] run started: om09pls8 with start time 1724412340.7504
8
+ 2024-08-23 20:25:41,342 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 20:25:41,342 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 20:25:41,412 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 20:25:41,418 DEBUG HandlerThread:13176 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 20:25:41,418 DEBUG HandlerThread:13176 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 20:25:41,418 INFO HandlerThread:13176 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 20:25:41,418 INFO SystemMonitor:13176 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 20:25:41,418 INFO HandlerThread:13176 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 20:25:41,419 INFO SystemMonitor:13176 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 20:25:41,419 INFO SystemMonitor:13176 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 20:25:41,420 INFO SystemMonitor:13176 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 20:25:41,421 INFO SystemMonitor:13176 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 20:25:41,423 INFO SystemMonitor:13176 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 20:25:41,454 DEBUG HandlerThread:13176 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 20:25:41,456 DEBUG HandlerThread:13176 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T11:25:41.454442', 'startedAt': '2024-08-23T11:25:40.735970', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '131072', '--micro-batch-size', '17', '--valid_micro_batch_size', '10', '--global-batch-size', '612', '--train-iters', '16000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '16000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '500', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-20:25:00'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 72, 'cpu_count_logical': 72, 'cpu_freq': {'current': 2400.038999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 4, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 226.66352462768555}}
26
+ 2024-08-23 20:25:41,470 INFO HandlerThread:13176 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 20:25:41,470 INFO HandlerThread:13176 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 20:25:41,472 INFO HandlerThread:13176 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 20:25:41,502 DEBUG SenderThread:13176 [sender.py:send():382] send: files
30
+ 2024-08-23 20:25:41,502 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 20:25:41,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 20:25:41,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-23 20:25:41,514 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-23 20:25:41,514 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 20:25:41,516 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 20:25:41,704 DEBUG SenderThread:13176 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 20:25:42,229 INFO wandb-upload_0:13176 [upload_job.py:push():131] Uploaded file /tmp/tmpnyk2zt9mwandb/u7uqpthk-wandb-metadata.json
38
+ 2024-08-23 20:25:42,338 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/output.log
39
+ 2024-08-23 20:25:42,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json
40
+ 2024-08-23 20:25:42,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt
41
+ 2024-08-23 20:25:44,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
42
+ 2024-08-23 20:25:46,083 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-23 20:25:46,340 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
44
+ 2024-08-23 20:25:47,341 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
45
+ 2024-08-23 20:25:51,887 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-23 20:25:52,344 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
47
+ 2024-08-23 20:25:53,345 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
48
+ 2024-08-23 20:25:55,346 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
49
+ 2024-08-23 20:25:56,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
50
+ 2024-08-23 20:25:56,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
51
+ 2024-08-23 20:25:56,513 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-23 20:25:57,347 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
53
+ 2024-08-23 20:25:57,765 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-23 20:25:58,348 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
55
+ 2024-08-23 20:25:59,349 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
56
+ 2024-08-23 20:26:02,595 DEBUG SenderThread:13176 [sender.py:send():382] send: config
57
+ 2024-08-23 20:26:02,595 DEBUG SenderThread:13176 [sender.py:send():382] send: config
58
+ 2024-08-23 20:26:03,351 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
59
+ 2024-08-23 20:26:03,596 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
60
+ 2024-08-23 20:26:04,352 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
61
+ 2024-08-23 20:26:08,596 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-23 20:26:11,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
63
+ 2024-08-23 20:26:11,514 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
64
+ 2024-08-23 20:26:11,514 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
65
+ 2024-08-23 20:26:13,784 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-23 20:26:14,359 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/config.yaml
67
+ 2024-08-23 20:26:18,968 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-23 20:26:23,969 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-08-23 20:26:28,970 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-23 20:26:30,283 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
71
+ 2024-08-23 20:26:30,422 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
72
+ 2024-08-23 20:26:30,422 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
73
+ 2024-08-23 20:26:34,601 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-23 20:26:39,602 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-23 20:26:39,739 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
76
+ 2024-08-23 20:26:41,424 DEBUG SystemMonitor:13176 [system_monitor.py:_start():172] Starting system metrics aggregation loop
77
+ 2024-08-23 20:26:41,427 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
78
+ 2024-08-23 20:26:42,374 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
79
+ 2024-08-23 20:26:45,083 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
80
+ 2024-08-23 20:26:45,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
81
+ 2024-08-23 20:26:45,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-23 20:26:45,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
83
+ 2024-08-23 20:26:46,679 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
84
+ 2024-08-23 20:26:46,682 DEBUG SenderThread:13176 [sender.py:send():382] send: history
85
+ 2024-08-23 20:26:46,682 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
86
+ 2024-08-23 20:26:46,684 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-23 20:26:47,378 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
88
+ 2024-08-23 20:26:48,379 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
89
+ 2024-08-23 20:26:49,094 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
90
+ 2024-08-23 20:26:50,093 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
91
+ 2024-08-23 20:26:50,380 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
92
+ 2024-08-23 20:26:55,094 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-23 20:26:56,822 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
94
+ 2024-08-23 20:26:56,825 DEBUG SenderThread:13176 [sender.py:send():382] send: history
95
+ 2024-08-23 20:26:56,826 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
96
+ 2024-08-23 20:26:56,829 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
97
+ 2024-08-23 20:26:57,385 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
98
+ 2024-08-23 20:26:58,386 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
99
+ 2024-08-23 20:27:00,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-23 20:27:00,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
101
+ 2024-08-23 20:27:00,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
102
+ 2024-08-23 20:27:00,326 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-23 20:27:03,720 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
104
+ 2024-08-23 20:27:03,722 DEBUG SenderThread:13176 [sender.py:send():382] send: history
105
+ 2024-08-23 20:27:03,722 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
106
+ 2024-08-23 20:27:03,723 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
107
+ 2024-08-23 20:27:04,390 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
108
+ 2024-08-23 20:27:05,764 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-08-23 20:27:06,392 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
110
+ 2024-08-23 20:27:10,765 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
111
+ 2024-08-23 20:27:10,923 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
112
+ 2024-08-23 20:27:10,926 DEBUG SenderThread:13176 [sender.py:send():382] send: history
113
+ 2024-08-23 20:27:10,926 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
114
+ 2024-08-23 20:27:10,928 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
115
+ 2024-08-23 20:27:11,395 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
116
+ 2024-08-23 20:27:11,429 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
117
+ 2024-08-23 20:27:12,396 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
118
+ 2024-08-23 20:27:13,173 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
119
+ 2024-08-23 20:27:14,397 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
120
+ 2024-08-23 20:27:15,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
121
+ 2024-08-23 20:27:15,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
122
+ 2024-08-23 20:27:15,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
123
+ 2024-08-23 20:27:16,306 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
124
+ 2024-08-23 20:27:20,728 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
125
+ 2024-08-23 20:27:20,730 DEBUG SenderThread:13176 [sender.py:send():382] send: history
126
+ 2024-08-23 20:27:20,730 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
127
+ 2024-08-23 20:27:20,731 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
128
+ 2024-08-23 20:27:21,402 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
129
+ 2024-08-23 20:27:21,772 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
130
+ 2024-08-23 20:27:22,403 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
131
+ 2024-08-23 20:27:26,773 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-08-23 20:27:27,731 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
133
+ 2024-08-23 20:27:27,734 DEBUG SenderThread:13176 [sender.py:send():382] send: history
134
+ 2024-08-23 20:27:27,734 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
135
+ 2024-08-23 20:27:27,736 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
136
+ 2024-08-23 20:27:28,408 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
137
+ 2024-08-23 20:27:30,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
138
+ 2024-08-23 20:27:30,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
139
+ 2024-08-23 20:27:30,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
140
+ 2024-08-23 20:27:30,409 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
141
+ 2024-08-23 20:27:32,346 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
142
+ 2024-08-23 20:27:34,646 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
143
+ 2024-08-23 20:27:34,649 DEBUG SenderThread:13176 [sender.py:send():382] send: history
144
+ 2024-08-23 20:27:34,649 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
145
+ 2024-08-23 20:27:34,651 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
146
+ 2024-08-23 20:27:35,413 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
147
+ 2024-08-23 20:27:36,413 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
148
+ 2024-08-23 20:27:36,883 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
149
+ 2024-08-23 20:27:37,883 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
150
+ 2024-08-23 20:27:38,415 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
151
+ 2024-08-23 20:27:41,432 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
152
+ 2024-08-23 20:27:43,433 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
153
+ 2024-08-23 20:27:44,546 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
154
+ 2024-08-23 20:27:44,549 DEBUG SenderThread:13176 [sender.py:send():382] send: history
155
+ 2024-08-23 20:27:44,549 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
156
+ 2024-08-23 20:27:44,551 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
157
+ 2024-08-23 20:27:45,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
158
+ 2024-08-23 20:27:45,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
159
+ 2024-08-23 20:27:45,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
160
+ 2024-08-23 20:27:45,420 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
161
+ 2024-08-23 20:27:46,421 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
162
+ 2024-08-23 20:27:49,351 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
163
+ 2024-08-23 20:27:51,452 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
164
+ 2024-08-23 20:27:51,454 DEBUG SenderThread:13176 [sender.py:send():382] send: history
165
+ 2024-08-23 20:27:51,454 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
166
+ 2024-08-23 20:27:51,456 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
167
+ 2024-08-23 20:27:52,425 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
168
+ 2024-08-23 20:27:53,012 DEBUG SenderThread:13176 [sender.py:send():382] send: exit
169
+ 2024-08-23 20:27:53,012 INFO SenderThread:13176 [sender.py:send_exit():589] handling exit code: 255
170
+ 2024-08-23 20:27:53,013 INFO SenderThread:13176 [sender.py:send_exit():591] handling runtime: 131
171
+ 2024-08-23 20:27:53,014 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
172
+ 2024-08-23 20:27:53,014 INFO SenderThread:13176 [sender.py:send_exit():597] send defer
173
+ 2024-08-23 20:27:53,014 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
174
+ 2024-08-23 20:27:53,014 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 0
175
+ 2024-08-23 20:27:53,015 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
176
+ 2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 0
177
+ 2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 1
178
+ 2024-08-23 20:27:53,015 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
179
+ 2024-08-23 20:27:53,015 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 1
180
+ 2024-08-23 20:27:53,015 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
181
+ 2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 1
182
+ 2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 2
183
+ 2024-08-23 20:27:53,015 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
184
+ 2024-08-23 20:27:53,015 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 2
185
+ 2024-08-23 20:27:53,015 INFO HandlerThread:13176 [system_monitor.py:finish():203] Stopping system monitor
186
+ 2024-08-23 20:27:53,015 DEBUG SystemMonitor:13176 [system_monitor.py:_start():179] Finished system metrics aggregation loop
187
+ 2024-08-23 20:27:53,016 DEBUG SystemMonitor:13176 [system_monitor.py:_start():183] Publishing last batch of metrics
188
+ 2024-08-23 20:27:53,016 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined cpu monitor
189
+ 2024-08-23 20:27:53,018 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined disk monitor
190
+ 2024-08-23 20:27:53,348 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined gpu monitor
191
+ 2024-08-23 20:27:53,349 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined memory monitor
192
+ 2024-08-23 20:27:53,349 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined network monitor
193
+ 2024-08-23 20:27:53,350 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
194
+ 2024-08-23 20:27:53,350 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 2
195
+ 2024-08-23 20:27:53,350 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 3
196
+ 2024-08-23 20:27:53,350 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
197
+ 2024-08-23 20:27:53,350 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
198
+ 2024-08-23 20:27:53,351 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 3
199
+ 2024-08-23 20:27:53,354 DEBUG SenderThread:13176 [sender.py:send():382] send: history
200
+ 2024-08-23 20:27:53,354 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
201
+ 2024-08-23 20:27:53,355 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
202
+ 2024-08-23 20:27:53,355 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
203
+ 2024-08-23 20:27:53,355 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 3
204
+ 2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 4
205
+ 2024-08-23 20:27:53,356 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
206
+ 2024-08-23 20:27:53,356 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 4
207
+ 2024-08-23 20:27:53,356 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
208
+ 2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 4
209
+ 2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 5
210
+ 2024-08-23 20:27:53,356 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
211
+ 2024-08-23 20:27:53,356 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 5
212
+ 2024-08-23 20:27:53,358 DEBUG SenderThread:13176 [sender.py:send():382] send: summary
213
+ 2024-08-23 20:27:53,358 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
214
+ 2024-08-23 20:27:53,359 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
215
+ 2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 5
216
+ 2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 6
217
+ 2024-08-23 20:27:53,359 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
218
+ 2024-08-23 20:27:53,359 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 6
219
+ 2024-08-23 20:27:53,359 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
220
+ 2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 6
221
+ 2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 7
222
+ 2024-08-23 20:27:53,359 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-23 20:27:53,360 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
224
+ 2024-08-23 20:27:53,360 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 7
225
+ 2024-08-23 20:27:53,360 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
226
+ 2024-08-23 20:27:53,360 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 7
227
+ 2024-08-23 20:27:53,427 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
228
+ 2024-08-23 20:27:54,012 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
229
+ 2024-08-23 20:27:54,427 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
230
+ 2024-08-23 20:27:54,598 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 8
231
+ 2024-08-23 20:27:54,598 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
232
+ 2024-08-23 20:27:54,598 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
233
+ 2024-08-23 20:27:54,598 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 8
234
+ 2024-08-23 20:27:54,599 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
235
+ 2024-08-23 20:27:54,599 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 8
236
+ 2024-08-23 20:27:54,599 INFO SenderThread:13176 [job_builder.py:build():296] Attempting to build job artifact
237
+ 2024-08-23 20:27:54,600 INFO SenderThread:13176 [job_builder.py:_get_source_type():426] is repo sourced job
238
+ 2024-08-23 20:27:54,620 INFO SenderThread:13176 [job_builder.py:build():402] adding wandb-job metadata file
239
+ 2024-08-23 20:27:54,630 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 9
240
+ 2024-08-23 20:27:54,631 DEBUG SenderThread:13176 [sender.py:send():382] send: artifact
241
+ 2024-08-23 20:27:54,631 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
242
+ 2024-08-23 20:27:54,632 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 9
243
+ 2024-08-23 20:27:55,013 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
244
+ 2024-08-23 20:27:55,428 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
245
+ 2024-08-23 20:27:55,507 INFO SenderThread:13176 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk4ODkxMQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'versionIndex': 3}}}
246
+ 2024-08-23 20:27:55,507 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
247
+ 2024-08-23 20:27:55,507 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 9
248
+ 2024-08-23 20:27:55,507 INFO SenderThread:13176 [dir_watcher.py:finish():358] shutting down directory watcher
249
+ 2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_202540-om09pls8/files
250
+ 2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt requirements.txt
251
+ 2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/config.yaml config.yaml
252
+ 2024-08-23 20:27:56,431 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json wandb-metadata.json
253
+ 2024-08-23 20:27:56,431 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json wandb-summary.json
254
+ 2024-08-23 20:27:56,433 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/output.log output.log
255
+ 2024-08-23 20:27:56,433 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 10
256
+ 2024-08-23 20:27:56,434 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
257
+ 2024-08-23 20:27:56,435 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
258
+ 2024-08-23 20:27:56,435 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 10
259
+ 2024-08-23 20:27:56,436 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
260
+ 2024-08-23 20:27:56,436 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 10
261
+ 2024-08-23 20:27:56,436 INFO SenderThread:13176 [file_pusher.py:finish():172] shutting down file pusher
262
+ 2024-08-23 20:27:56,841 INFO wandb-upload_0:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt
263
+ 2024-08-23 20:27:56,894 INFO wandb-upload_2:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
264
+ 2024-08-23 20:27:56,896 INFO wandb-upload_3:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/output.log
265
+ 2024-08-23 20:27:56,902 INFO wandb-upload_1:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/config.yaml
266
+ 2024-08-23 20:27:57,013 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
267
+ 2024-08-23 20:27:57,014 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
268
+ 2024-08-23 20:27:57,103 INFO Thread-11 (_thread_body):13176 [sender.py:transition_state():617] send defer: 11
269
+ 2024-08-23 20:27:57,103 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
270
+ 2024-08-23 20:27:57,103 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 11
271
+ 2024-08-23 20:27:57,103 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
272
+ 2024-08-23 20:27:57,103 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 11
273
+ 2024-08-23 20:27:57,103 INFO SenderThread:13176 [file_pusher.py:join():178] waiting for file pusher
274
+ 2024-08-23 20:27:57,103 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 12
275
+ 2024-08-23 20:27:57,103 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
276
+ 2024-08-23 20:27:57,103 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 12
277
+ 2024-08-23 20:27:57,104 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
278
+ 2024-08-23 20:27:57,104 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 12
279
+ 2024-08-23 20:27:57,104 INFO SenderThread:13176 [file_stream.py:finish():595] file stream finish called
280
+ 2024-08-23 20:27:57,389 INFO SenderThread:13176 [file_stream.py:finish():599] file stream finish is done
281
+ 2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 13
282
+ 2024-08-23 20:27:57,389 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
283
+ 2024-08-23 20:27:57,389 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 13
284
+ 2024-08-23 20:27:57,389 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
285
+ 2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 13
286
+ 2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 14
287
+ 2024-08-23 20:27:57,390 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
288
+ 2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send():382] send: final
289
+ 2024-08-23 20:27:57,390 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 14
290
+ 2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send():382] send: footer
291
+ 2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
292
+ 2024-08-23 20:27:57,390 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 14
293
+ 2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
294
+ 2024-08-23 20:27:57,391 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
295
+ 2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
296
+ 2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: server_info
297
+ 2024-08-23 20:27:57,392 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
298
+ 2024-08-23 20:27:57,392 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: get_summary
299
+ 2024-08-23 20:27:57,392 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: server_info
300
+ 2024-08-23 20:27:57,393 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: sampled_history
301
+ 2024-08-23 20:27:57,395 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
302
+ 2024-08-23 20:27:57,395 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: job_info
303
+ 2024-08-23 20:27:57,563 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: job_info
304
+ 2024-08-23 20:27:57,563 INFO MainThread:13176 [wandb_run.py:_footer_history_summary_info():3866] rendering history
305
+ 2024-08-23 20:27:57,564 INFO MainThread:13176 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
306
+ 2024-08-23 20:27:57,564 INFO MainThread:13176 [wandb_run.py:_footer_sync_info():3825] logging synced files
307
+ 2024-08-23 20:27:57,564 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: shutdown
308
+ 2024-08-23 20:27:57,564 INFO HandlerThread:13176 [handler.py:finish():869] shutting down handler
309
+ 2024-08-23 20:27:58,396 INFO WriterThread:13176 [datastore.py:close():296] close: /project/wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb
310
+ 2024-08-23 20:27:58,563 INFO SenderThread:13176 [sender.py:finish():1572] shutting down sender
311
+ 2024-08-23 20:27:58,563 INFO SenderThread:13176 [file_pusher.py:finish():172] shutting down file pusher
312
+ 2024-08-23 20:27:58,563 INFO SenderThread:13176 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240823_202540-om09pls8/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 20:25:40,742 INFO MainThread:12857 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Configure stats pid to 12857
3
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_202540-om09pls8/logs/debug.log
9
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_202540-om09pls8/logs/debug-internal.log
10
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-20:25:00', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 500, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 16000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 16000, 'train_samples': None, 'global_batch_size': 612, 'micro_batch_size': 17, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 10, 'rank': 0, 'world_size': 4, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 9}
13
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():616] starting backend
14
+ 2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-23 20:25:40,748 INFO MainThread:12857 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-23 20:25:40,750 INFO MainThread:12857 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-23 20:25:40,754 INFO MainThread:12857 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-23 20:25:40,864 INFO MainThread:12857 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-23 20:25:41,341 INFO MainThread:12857 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-23 20:25:41,364 INFO MainThread:12857 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-23 20:25:41,364 INFO MainThread:12857 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-23 20:25:41,513 INFO MainThread:12857 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-23 20:25:41,514 INFO MainThread:12857 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-23 20:26:02,594 INFO MainThread:12857 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 1024, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-23 20:26:02,595 INFO MainThread:12857 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 4}
30
+ 2024-08-23 20:27:58,565 WARNING MsgRouterThr:12857 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb ADDED
Binary file (47.7 kB). View file
 
wandb/run-20240831_192346-5vo4p2k7/files/config.yaml ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: SHARD_GRAD_OP
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value: null
30
+ valid_data_path:
31
+ desc: null
32
+ value: null
33
+ test_data_path:
34
+ desc: null
35
+ value: null
36
+ data_cache_path:
37
+ desc: null
38
+ value: null
39
+ vocab_size:
40
+ desc: null
41
+ value: null
42
+ vocab_file:
43
+ desc: null
44
+ value: null
45
+ merge_file:
46
+ desc: null
47
+ value: null
48
+ seq_length:
49
+ desc: null
50
+ value: 2048
51
+ num_workers:
52
+ desc: null
53
+ value: 4
54
+ tokenizer_type:
55
+ desc: null
56
+ value: HFPreTrainedTokenizer
57
+ tokenizer_model:
58
+ desc: null
59
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
60
+ reset_position_ids:
61
+ desc: null
62
+ value: false
63
+ reset_attention_mask:
64
+ desc: null
65
+ value: false
66
+ eod_mask_loss:
67
+ desc: null
68
+ value: false
69
+ retro_return_doc_ids:
70
+ desc: null
71
+ value: false
72
+ short_seq_prob:
73
+ desc: null
74
+ value: 0.1
75
+ vocab_extra_ids:
76
+ desc: null
77
+ value: 0
78
+ seed:
79
+ desc: null
80
+ value: 1234
81
+ use_mpi:
82
+ desc: null
83
+ value: false
84
+ wandb_entity:
85
+ desc: null
86
+ value: iwakawa-koichi-q5-tohoku-nlp6723
87
+ wandb_name:
88
+ desc: null
89
+ value: yans-baseline-qwen2-0.5B-3.5e-5-ichikara_train_2024-08-31-19:23:33
90
+ wandb_project:
91
+ desc: null
92
+ value: yans_experiment
93
+ quantization:
94
+ desc: null
95
+ value: false
96
+ use_freeze_layers:
97
+ desc: null
98
+ value: false
99
+ freeze_layers:
100
+ desc: null
101
+ value: null
102
+ bf16:
103
+ desc: null
104
+ value: true
105
+ fp16:
106
+ desc: null
107
+ value: false
108
+ mixed_precision:
109
+ desc: null
110
+ value: true
111
+ param_dtype:
112
+ desc: null
113
+ value: null
114
+ load:
115
+ desc: null
116
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
117
+ save:
118
+ desc: null
119
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
120
+ base_model:
121
+ desc: null
122
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
123
+ use_better_transformer:
124
+ desc: null
125
+ value: false
126
+ grad_clip_norm:
127
+ desc: null
128
+ value: 1.0
129
+ eval_interval:
130
+ desc: null
131
+ value: 100
132
+ save_interval:
133
+ desc: null
134
+ value: 100
135
+ eval_iters:
136
+ desc: null
137
+ value: 10
138
+ optimizer:
139
+ desc: null
140
+ value: anyprecision
141
+ lr:
142
+ desc: null
143
+ value: 2.0e-05
144
+ lr_decay_style:
145
+ desc: null
146
+ value: cosine
147
+ lr_decay_iters:
148
+ desc: null
149
+ value: 1000
150
+ lr_warmup_iters:
151
+ desc: null
152
+ value: 25
153
+ min_lr:
154
+ desc: null
155
+ value: 1.0e-06
156
+ train_iters:
157
+ desc: null
158
+ value: 1000
159
+ train_samples:
160
+ desc: null
161
+ value: null
162
+ global_batch_size:
163
+ desc: null
164
+ value: 16
165
+ micro_batch_size:
166
+ desc: null
167
+ value: 4
168
+ make_vocab_size_divisible_by:
169
+ desc: null
170
+ value: 128
171
+ sliding_window_size:
172
+ desc: null
173
+ value: 4096
174
+ skip_batch:
175
+ desc: null
176
+ value: null
177
+ no_save_optimizer_state:
178
+ desc: null
179
+ value: false
180
+ continual_pretraining:
181
+ desc: null
182
+ value: false
183
+ instruction_tuning:
184
+ desc: null
185
+ value: false
186
+ direct_preference_optimization:
187
+ desc: null
188
+ value: false
189
+ attention_dropout:
190
+ desc: null
191
+ value: 0.1
192
+ hidden_dropout:
193
+ desc: null
194
+ value: 0.1
195
+ weight_decay:
196
+ desc: null
197
+ value: 0.1
198
+ adam_beta1:
199
+ desc: null
200
+ value: 0.9
201
+ adam_beta2:
202
+ desc: null
203
+ value: 0.99
204
+ adam_eps:
205
+ desc: null
206
+ value: 1.0e-06
207
+ hf_transformer_model_dir:
208
+ desc: null
209
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
210
+ instruction_train_data_path:
211
+ desc: null
212
+ value: /work/datasets/bin/ichikara/train/data
213
+ instruction_valid_data_path:
214
+ desc: null
215
+ value: /work/datasets/bin/ichikara/valid/data
216
+ epoch:
217
+ desc: null
218
+ value: null
219
+ instruction_dataset_size:
220
+ desc: null
221
+ value: null
222
+ save_sampler_state:
223
+ desc: null
224
+ value: true
225
+ label_smoothing:
226
+ desc: null
227
+ value: 0.0
228
+ save_n_checkpoints:
229
+ desc: null
230
+ value: 10
231
+ hf_repo_id:
232
+ desc: null
233
+ value: koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
234
+ create_public_hf_repo:
235
+ desc: null
236
+ value: false
237
+ upload_all_checkpoints_to_hf:
238
+ desc: null
239
+ value: false
240
+ hf_upload_retry_limit:
241
+ desc: null
242
+ value: 2
243
+ exit_duration_in_mins:
244
+ desc: null
245
+ value: null
246
+ source_key:
247
+ desc: null
248
+ value: source
249
+ target_key:
250
+ desc: null
251
+ value: target
252
+ attn_implementation:
253
+ desc: null
254
+ value: flash_attention_2
255
+ efficient_instruction_tuning:
256
+ desc: null
257
+ value: true
258
+ remove_padding_masking:
259
+ desc: null
260
+ value: true
261
+ save_start_iter:
262
+ desc: null
263
+ value: null
264
+ valid_micro_batch_size:
265
+ desc: null
266
+ value: 1
267
+ rank:
268
+ desc: null
269
+ value: 0
270
+ world_size:
271
+ desc: null
272
+ value: 1
273
+ padded_vocab_size:
274
+ desc: null
275
+ value: 151680
276
+ gradient_accumulation_steps:
277
+ desc: null
278
+ value: 4
279
+ _wandb:
280
+ desc: null
281
+ value:
282
+ python_version: 3.10.12
283
+ cli_version: 0.16.3
284
+ framework: huggingface
285
+ huggingface_version: 4.43.3
286
+ is_jupyter_run: false
287
+ is_kaggle_kernel: false
288
+ start_time: 1725099826.21653
289
+ t:
290
+ 1:
291
+ - 1
292
+ - 11
293
+ - 49
294
+ - 55
295
+ - 71
296
+ - 105
297
+ 2:
298
+ - 1
299
+ - 11
300
+ - 49
301
+ - 55
302
+ - 71
303
+ - 105
304
+ 3:
305
+ - 13
306
+ - 16
307
+ - 23
308
+ 4: 3.10.12
309
+ 5: 0.16.3
310
+ 6: 4.43.3
311
+ 8:
312
+ - 5
313
+ 13: linux-x86_64
wandb/run-20240831_192346-5vo4p2k7/files/output.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
8
+ Traceback (most recent call last):
9
+ File "/project/examples/finetuning.py", line 13, in <module>
10
+ main()
11
+ File "/project/src/llama_recipes/finetuning.py", line 103, in main
12
+ model = get_model(
13
+ File "/project/src/llama_recipes/get_models.py", line 106, in get_model
14
+ assert sliding_window == 131072
15
+ AssertionError
wandb/run-20240831_192346-5vo4p2k7/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240831_192346-5vo4p2k7/files/wandb-metadata.json ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-31T10:23:46.789150",
5
+ "startedAt": "2024-08-31T10:23:46.204215",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "2048",
11
+ "--micro-batch-size",
12
+ "4",
13
+ "--valid_micro_batch_size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "16",
17
+ "--train-iters",
18
+ "1000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--instruction-train-data-path",
24
+ "/work/datasets/bin/ichikara/train/data",
25
+ "--instruction-valid-data-path",
26
+ "/work/datasets/bin/ichikara/valid/data",
27
+ "--efficient-instruction-tuning",
28
+ "--remove-padding-masking",
29
+ "--source-key",
30
+ "source",
31
+ "--target-key",
32
+ "target",
33
+ "--lr",
34
+ "2e-5",
35
+ "--min-lr",
36
+ "1e-6",
37
+ "--lr-decay-style",
38
+ "cosine",
39
+ "--lr-warmup-iters",
40
+ "25",
41
+ "--lr-decay-iters",
42
+ "1000",
43
+ "--weight-decay",
44
+ "0.1",
45
+ "--grad-clip-norm",
46
+ "1.0",
47
+ "--optimizer",
48
+ "anyprecision",
49
+ "--adam-beta1",
50
+ "0.9",
51
+ "--adam-beta2",
52
+ "0.99",
53
+ "--adam-eps",
54
+ "1e-6",
55
+ "--save-interval",
56
+ "100",
57
+ "--eval-interval",
58
+ "100",
59
+ "--eval-iters",
60
+ "10",
61
+ "--bf16",
62
+ "--mixed-precision",
63
+ "--base-model",
64
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
65
+ "--hf-transformer-model-dir",
66
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
67
+ "--save",
68
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
69
+ "--load",
70
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
71
+ "--fsdp-activation-checkpointing",
72
+ "--sharding-strategy",
73
+ "SHARD_GRAD_OP",
74
+ "--checkpoint-type",
75
+ "LOCAL_STATE_DICT",
76
+ "--save-sampler-state",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--hf-upload-retry-limit",
80
+ "2",
81
+ "--hf-repo-id",
82
+ "koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
83
+ "--num-workers",
84
+ "4",
85
+ "--wandb-entity",
86
+ "iwakawa-koichi-q5-tohoku-nlp6723",
87
+ "--wandb-project",
88
+ "yans_experiment",
89
+ "--wandb-name",
90
+ "yans-baseline-qwen2-0.5B-3.5e-5-ichikara_train_2024-08-31-19:23:33"
91
+ ],
92
+ "state": "running",
93
+ "program": "/project/examples/finetuning.py",
94
+ "codePathLocal": "examples/finetuning.py",
95
+ "codePath": "examples/finetuning.py",
96
+ "git": {
97
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
98
+ "commit": "3b2976faebe2228c39adb20194a29b785a37defe"
99
+ },
100
+ "email": null,
101
+ "root": "/project",
102
+ "host": "gpu-koiwa-00",
103
+ "username": "koiwa",
104
+ "executable": "/usr/bin/python",
105
+ "cpu_count": 18,
106
+ "cpu_count_logical": 18,
107
+ "cpu_freq": {
108
+ "current": 2400.025999999999,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ "cpu_freq_per_core": [
113
+ {
114
+ "current": 2400.026,
115
+ "min": 0.0,
116
+ "max": 0.0
117
+ },
118
+ {
119
+ "current": 2400.026,
120
+ "min": 0.0,
121
+ "max": 0.0
122
+ },
123
+ {
124
+ "current": 2400.026,
125
+ "min": 0.0,
126
+ "max": 0.0
127
+ },
128
+ {
129
+ "current": 2400.026,
130
+ "min": 0.0,
131
+ "max": 0.0
132
+ },
133
+ {
134
+ "current": 2400.026,
135
+ "min": 0.0,
136
+ "max": 0.0
137
+ },
138
+ {
139
+ "current": 2400.026,
140
+ "min": 0.0,
141
+ "max": 0.0
142
+ },
143
+ {
144
+ "current": 2400.026,
145
+ "min": 0.0,
146
+ "max": 0.0
147
+ },
148
+ {
149
+ "current": 2400.026,
150
+ "min": 0.0,
151
+ "max": 0.0
152
+ },
153
+ {
154
+ "current": 2400.026,
155
+ "min": 0.0,
156
+ "max": 0.0
157
+ },
158
+ {
159
+ "current": 2400.026,
160
+ "min": 0.0,
161
+ "max": 0.0
162
+ },
163
+ {
164
+ "current": 2400.026,
165
+ "min": 0.0,
166
+ "max": 0.0
167
+ },
168
+ {
169
+ "current": 2400.026,
170
+ "min": 0.0,
171
+ "max": 0.0
172
+ },
173
+ {
174
+ "current": 2400.026,
175
+ "min": 0.0,
176
+ "max": 0.0
177
+ },
178
+ {
179
+ "current": 2400.026,
180
+ "min": 0.0,
181
+ "max": 0.0
182
+ },
183
+ {
184
+ "current": 2400.026,
185
+ "min": 0.0,
186
+ "max": 0.0
187
+ },
188
+ {
189
+ "current": 2400.026,
190
+ "min": 0.0,
191
+ "max": 0.0
192
+ },
193
+ {
194
+ "current": 2400.026,
195
+ "min": 0.0,
196
+ "max": 0.0
197
+ },
198
+ {
199
+ "current": 2400.026,
200
+ "min": 0.0,
201
+ "max": 0.0
202
+ }
203
+ ],
204
+ "disk": {
205
+ "/": {
206
+ "total": 0.0625,
207
+ "used": 1.1444091796875e-05
208
+ }
209
+ },
210
+ "gpu": "NVIDIA A100-SXM4-40GB",
211
+ "gpu_count": 1,
212
+ "gpu_devices": [
213
+ {
214
+ "name": "NVIDIA A100-SXM4-40GB",
215
+ "memory_total": 42949672960
216
+ }
217
+ ],
218
+ "memory": {
219
+ "total": 56.48781967163086
220
+ }
221
+ }
wandb/run-20240831_192346-5vo4p2k7/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 3}}