Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- wandb/run-20240803_191521-sg37tylz/files/config.yaml +335 -0
- wandb/run-20240803_191521-sg37tylz/files/output.log +103 -0
- wandb/run-20240803_191521-sg37tylz/files/requirements.txt +271 -0
- wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json +215 -0
- wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json +1 -0
- wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log +194 -0
- wandb/run-20240803_191521-sg37tylz/logs/debug.log +30 -0
- wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb +0 -0
- wandb/run-20240803_191815-jdwps0z3/files/config.yaml +335 -0
- wandb/run-20240803_191815-jdwps0z3/files/output.log +239 -0
- wandb/run-20240803_191815-jdwps0z3/files/requirements.txt +271 -0
- wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json +215 -0
- wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json +1 -0
- wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log +524 -0
- wandb/run-20240803_191815-jdwps0z3/logs/debug.log +29 -0
- wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb +0 -0
- wandb/run-20240803_192355-n3hnzq4n/files/config.yaml +335 -0
- wandb/run-20240803_192355-n3hnzq4n/files/output.log +0 -0
- wandb/run-20240803_192355-n3hnzq4n/files/requirements.txt +271 -0
- wandb/run-20240803_192355-n3hnzq4n/files/wandb-metadata.json +215 -0
- wandb/run-20240803_192355-n3hnzq4n/files/wandb-summary.json +1 -0
- wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log +0 -0
- wandb/run-20240803_192355-n3hnzq4n/logs/debug.log +29 -0
- wandb/run-20240812_063027-j1htzx7q/files/config.yaml +335 -0
- wandb/run-20240812_063027-j1htzx7q/files/requirements.txt +271 -0
- wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json +215 -0
- wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log +261 -0
- wandb/run-20240812_063027-j1htzx7q/logs/debug.log +30 -0
- wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb +0 -0
- wandb/run-20240823_163849-faey1t8u/files/config.yaml +342 -0
- wandb/run-20240823_163849-faey1t8u/files/output.log +126 -0
- wandb/run-20240823_163849-faey1t8u/files/requirements.txt +375 -0
- wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json +220 -0
- wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json +1 -0
- wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log +439 -0
- wandb/run-20240823_163849-faey1t8u/logs/debug.log +30 -0
- wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb +0 -0
- wandb/run-20240823_202540-om09pls8/files/config.yaml +342 -0
- wandb/run-20240823_202540-om09pls8/files/output.log +133 -0
- wandb/run-20240823_202540-om09pls8/files/requirements.txt +375 -0
- wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json +502 -0
- wandb/run-20240823_202540-om09pls8/files/wandb-summary.json +1 -0
- wandb/run-20240823_202540-om09pls8/logs/debug-internal.log +312 -0
- wandb/run-20240823_202540-om09pls8/logs/debug.log +30 -0
- wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb +0 -0
- wandb/run-20240831_192346-5vo4p2k7/files/config.yaml +313 -0
- wandb/run-20240831_192346-5vo4p2k7/files/output.log +15 -0
- wandb/run-20240831_192346-5vo4p2k7/files/requirements.txt +375 -0
- wandb/run-20240831_192346-5vo4p2k7/files/wandb-metadata.json +221 -0
- wandb/run-20240831_192346-5vo4p2k7/files/wandb-summary.json +1 -0
wandb/run-20240803_191521-sg37tylz/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample_train_2024-08-03-19:14:48
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 1600
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 40
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722680121.573481
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 256
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: mistral
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 512
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 4
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 4
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: MistralForCausalLM
|
wandb/run-20240803_191521-sg37tylz/files/output.log
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
5 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
6 |
+
warnings.warn(
|
7 |
+
Let split = None
|
8 |
+
Loading model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
9 |
+
Loaded model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
10 |
+
--> Model /share/pretrained_lm/custom/tiny-mistral
|
11 |
+
--> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
|
12 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
13 |
+
--> applying fsdp activation checkpointing...
|
14 |
+
> datasets target sizes (minimum size):
|
15 |
+
train: 32000000
|
16 |
+
validation: 1616000
|
17 |
+
test: 16000
|
18 |
+
> building train, validation, and test datasets for GPT ...
|
19 |
+
Building a BlendedDataset for a single MegatronDataset
|
20 |
+
Unable to save the indexes because path_to_cache is None
|
21 |
+
Building a BlendedDataset for a single MegatronDataset
|
22 |
+
Unable to save the indexes because path_to_cache is None
|
23 |
+
Building a BlendedDataset for a single MegatronDataset
|
24 |
+
Unable to save the indexes because path_to_cache is None
|
25 |
+
> finished creating GPT datasets ...
|
26 |
+
Loading optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
27 |
+
Loaded optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
28 |
+
model info: FullyShardedDataParallel(
|
29 |
+
(_fsdp_wrapped_module): MistralForCausalLM(
|
30 |
+
(model): MistralModel(
|
31 |
+
(embed_tokens): Embedding(32768, 256)
|
32 |
+
(layers): ModuleList(
|
33 |
+
(0-3): 4 x FullyShardedDataParallel(
|
34 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
35 |
+
(_checkpoint_wrapped_module): MistralDecoderLayer(
|
36 |
+
(self_attn): MistralFlashAttention2(
|
37 |
+
(q_proj): Linear(in_features=256, out_features=512, bias=False)
|
38 |
+
(k_proj): Linear(in_features=256, out_features=256, bias=False)
|
39 |
+
(v_proj): Linear(in_features=256, out_features=256, bias=False)
|
40 |
+
(o_proj): Linear(in_features=512, out_features=256, bias=False)
|
41 |
+
(rotary_emb): MistralRotaryEmbedding()
|
42 |
+
)
|
43 |
+
(mlp): MistralMLP(
|
44 |
+
(gate_proj): Linear(in_features=256, out_features=512, bias=False)
|
45 |
+
(up_proj): Linear(in_features=256, out_features=512, bias=False)
|
46 |
+
(down_proj): Linear(in_features=512, out_features=256, bias=False)
|
47 |
+
(act_fn): SiLU()
|
48 |
+
)
|
49 |
+
(input_layernorm): MistralRMSNorm()
|
50 |
+
(post_attention_layernorm): MistralRMSNorm()
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
)
|
55 |
+
(norm): MistralRMSNorm()
|
56 |
+
)
|
57 |
+
(lm_head): Linear(in_features=256, out_features=32768, bias=False)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
model config: MistralConfig {
|
61 |
+
"_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
|
62 |
+
"architectures": [
|
63 |
+
"MistralForCausalLM"
|
64 |
+
],
|
65 |
+
"attention_dropout": 0.0,
|
66 |
+
"bos_token_id": 1,
|
67 |
+
"eos_token_id": 2,
|
68 |
+
"head_dim": 128,
|
69 |
+
"hidden_act": "silu",
|
70 |
+
"hidden_size": 256,
|
71 |
+
"initializer_range": 0.02,
|
72 |
+
"intermediate_size": 512,
|
73 |
+
"label_smoothing": 0.0,
|
74 |
+
"max_position_embeddings": 512,
|
75 |
+
"model_type": "mistral",
|
76 |
+
"num_attention_heads": 4,
|
77 |
+
"num_hidden_layers": 4,
|
78 |
+
"num_key_value_heads": 2,
|
79 |
+
"rms_norm_eps": 1e-05,
|
80 |
+
"rope_theta": 1000000.0,
|
81 |
+
"sliding_window": 4096,
|
82 |
+
"tie_word_embeddings": false,
|
83 |
+
"torch_dtype": "float32",
|
84 |
+
"transformers_version": "4.43.3",
|
85 |
+
"use_cache": false,
|
86 |
+
"vocab_size": 32768
|
87 |
+
}
|
88 |
+
Saving checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000
|
89 |
+
Saving model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
90 |
+
Saved model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
91 |
+
Saving optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
92 |
+
Saved optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
93 |
+
Saving scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
|
94 |
+
Saved scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
|
95 |
+
Saving RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
|
96 |
+
Saved RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
|
97 |
+
Saved checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000, took 0.18s
|
98 |
+
[rank0]:[2024-08-03 19:15:37,067] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
99 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
100 |
+
warnings.warn(
|
101 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
102 |
+
warnings.warn(
|
103 |
+
[rank0]:[2024-08-03 19:15:37,199] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0010301990000698424, 'preprocessing_with_comm': 0.009796595000352681, 'state_converting': 0.007119276000139507, <Type.ALL: 'all'>: 0.018263464000028762})
|
wandb/run-20240803_191521-sg37tylz/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-03T10:15:22.462858",
|
5 |
+
"startedAt": "2024-08-03T10:15:21.560082",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"40",
|
15 |
+
"--global-batch-size",
|
16 |
+
"1600",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample_train_2024-08-03-19:14:48"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.034,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.034,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.034,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.034,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.034,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.034,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.034,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.034,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.034,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.034,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.034,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.034,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.034,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.034,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.034,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.034,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.034,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.034,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.034,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 14}}
|
wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-03 19:15:21,574 INFO StreamThr :9246 [internal.py:wandb_internal():86] W&B internal server running at pid: 9246, started at: 2024-08-03 19:15:21.573011
|
2 |
+
2024-08-03 19:15:21,575 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-03 19:15:21,578 INFO WriterThread:9246 [datastore.py:open_for_write():87] open: /project/wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb
|
4 |
+
2024-08-03 19:15:21,579 DEBUG SenderThread:9246 [sender.py:send():382] send: header
|
5 |
+
2024-08-03 19:15:21,755 DEBUG SenderThread:9246 [sender.py:send():382] send: run
|
6 |
+
2024-08-03 19:15:22,256 INFO SenderThread:9246 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240803_191521-sg37tylz/files
|
7 |
+
2024-08-03 19:15:22,257 INFO SenderThread:9246 [sender.py:_start_run_threads():1136] run started: sg37tylz with start time 1722680121.573481
|
8 |
+
2024-08-03 19:15:22,262 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-03 19:15:22,262 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-03 19:15:22,442 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-03 19:15:22,448 DEBUG HandlerThread:9246 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-03 19:15:22,448 DEBUG HandlerThread:9246 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-03 19:15:22,448 INFO HandlerThread:9246 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-03 19:15:22,448 INFO SystemMonitor:9246 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-03 19:15:22,449 INFO HandlerThread:9246 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-03 19:15:22,449 INFO SystemMonitor:9246 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-03 19:15:22,450 INFO SystemMonitor:9246 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-03 19:15:22,450 INFO SystemMonitor:9246 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-03 19:15:22,451 INFO SystemMonitor:9246 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-03 19:15:22,452 INFO SystemMonitor:9246 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-03 19:15:22,462 DEBUG HandlerThread:9246 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-03 19:15:22,464 DEBUG HandlerThread:9246 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-03 19:15:22,476 DEBUG HandlerThread:9246 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T10:15:22.462858', 'startedAt': '2024-08-03T10:15:21.560082', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '40', '--global-batch-size', '1600', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-03-19:14:48'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-03 19:15:22,476 INFO HandlerThread:9246 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-03 19:15:22,476 INFO HandlerThread:9246 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-03 19:15:22,478 INFO HandlerThread:9246 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-03 19:15:22,505 DEBUG SenderThread:9246 [sender.py:send():382] send: files
|
30 |
+
2024-08-03 19:15:22,505 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-03 19:15:22,514 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-03 19:15:22,514 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-03 19:15:22,527 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-03 19:15:22,755 DEBUG SenderThread:9246 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-03 19:15:23,258 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt
|
38 |
+
2024-08-03 19:15:23,258 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json
|
39 |
+
2024-08-03 19:15:25,596 INFO wandb-upload_0:9246 [upload_job.py:push():131] Uploaded file /tmp/tmp1lfwq_epwandb/7v7ji8nj-wandb-metadata.json
|
40 |
+
2024-08-03 19:15:26,756 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
|
41 |
+
2024-08-03 19:15:28,261 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
42 |
+
2024-08-03 19:15:30,262 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
43 |
+
2024-08-03 19:15:31,262 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
44 |
+
2024-08-03 19:15:31,904 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-03 19:15:33,263 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
46 |
+
2024-08-03 19:15:37,068 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
|
47 |
+
2024-08-03 19:15:37,110 DEBUG SenderThread:9246 [sender.py:send():382] send: config
|
48 |
+
2024-08-03 19:15:37,111 DEBUG SenderThread:9246 [sender.py:send():382] send: config
|
49 |
+
2024-08-03 19:15:37,265 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
50 |
+
2024-08-03 19:15:37,316 DEBUG SenderThread:9246 [sender.py:send():382] send: exit
|
51 |
+
2024-08-03 19:15:37,316 INFO SenderThread:9246 [sender.py:send_exit():589] handling exit code: 0
|
52 |
+
2024-08-03 19:15:37,317 INFO SenderThread:9246 [sender.py:send_exit():591] handling runtime: 14
|
53 |
+
2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
54 |
+
2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:send_exit():597] send defer
|
55 |
+
2024-08-03 19:15:37,318 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
56 |
+
2024-08-03 19:15:37,318 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 0
|
57 |
+
2024-08-03 19:15:37,318 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
58 |
+
2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 0
|
59 |
+
2024-08-03 19:15:37,318 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 1
|
60 |
+
2024-08-03 19:15:37,319 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
61 |
+
2024-08-03 19:15:37,319 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 1
|
62 |
+
2024-08-03 19:15:37,319 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
63 |
+
2024-08-03 19:15:37,319 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 1
|
64 |
+
2024-08-03 19:15:37,319 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 2
|
65 |
+
2024-08-03 19:15:37,319 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
66 |
+
2024-08-03 19:15:37,319 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 2
|
67 |
+
2024-08-03 19:15:37,319 INFO HandlerThread:9246 [system_monitor.py:finish():203] Stopping system monitor
|
68 |
+
2024-08-03 19:15:37,319 DEBUG SystemMonitor:9246 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
69 |
+
2024-08-03 19:15:37,319 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined cpu monitor
|
70 |
+
2024-08-03 19:15:37,320 DEBUG SystemMonitor:9246 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
71 |
+
2024-08-03 19:15:37,320 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined disk monitor
|
72 |
+
2024-08-03 19:15:37,320 DEBUG SystemMonitor:9246 [system_monitor.py:_start():183] Publishing last batch of metrics
|
73 |
+
2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined gpu monitor
|
74 |
+
2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined memory monitor
|
75 |
+
2024-08-03 19:15:37,351 INFO HandlerThread:9246 [interfaces.py:finish():202] Joined network monitor
|
76 |
+
2024-08-03 19:15:37,352 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
77 |
+
2024-08-03 19:15:37,352 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 2
|
78 |
+
2024-08-03 19:15:37,352 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 3
|
79 |
+
2024-08-03 19:15:37,352 DEBUG SenderThread:9246 [sender.py:send():382] send: stats
|
80 |
+
2024-08-03 19:15:37,352 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
81 |
+
2024-08-03 19:15:37,352 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 3
|
82 |
+
2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
83 |
+
2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 3
|
84 |
+
2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 4
|
85 |
+
2024-08-03 19:15:37,353 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
86 |
+
2024-08-03 19:15:37,353 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 4
|
87 |
+
2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
88 |
+
2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 4
|
89 |
+
2024-08-03 19:15:37,353 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 5
|
90 |
+
2024-08-03 19:15:37,353 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
91 |
+
2024-08-03 19:15:37,353 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 5
|
92 |
+
2024-08-03 19:15:37,353 DEBUG SenderThread:9246 [sender.py:send():382] send: summary
|
93 |
+
2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
94 |
+
2024-08-03 19:15:37,354 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
95 |
+
2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 5
|
96 |
+
2024-08-03 19:15:37,354 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 6
|
97 |
+
2024-08-03 19:15:37,355 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
98 |
+
2024-08-03 19:15:37,355 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 6
|
99 |
+
2024-08-03 19:15:37,355 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
100 |
+
2024-08-03 19:15:37,355 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 6
|
101 |
+
2024-08-03 19:15:37,357 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-03 19:15:37,549 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 7
|
103 |
+
2024-08-03 19:15:37,550 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
104 |
+
2024-08-03 19:15:37,550 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 7
|
105 |
+
2024-08-03 19:15:37,550 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
106 |
+
2024-08-03 19:15:37,550 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 7
|
107 |
+
2024-08-03 19:15:38,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml
|
108 |
+
2024-08-03 19:15:38,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json
|
109 |
+
2024-08-03 19:15:38,316 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
|
110 |
+
2024-08-03 19:15:39,265 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 8
|
111 |
+
2024-08-03 19:15:39,265 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
|
112 |
+
2024-08-03 19:15:39,265 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
113 |
+
2024-08-03 19:15:39,265 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 8
|
114 |
+
2024-08-03 19:15:39,265 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
115 |
+
2024-08-03 19:15:39,265 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 8
|
116 |
+
2024-08-03 19:15:39,266 INFO SenderThread:9246 [job_builder.py:build():296] Attempting to build job artifact
|
117 |
+
2024-08-03 19:15:39,266 INFO SenderThread:9246 [job_builder.py:_get_source_type():426] is repo sourced job
|
118 |
+
2024-08-03 19:15:39,267 INFO Thread-12 :9246 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
119 |
+
2024-08-03 19:15:39,281 INFO SenderThread:9246 [job_builder.py:build():402] adding wandb-job metadata file
|
120 |
+
2024-08-03 19:15:39,289 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 9
|
121 |
+
2024-08-03 19:15:39,290 DEBUG SenderThread:9246 [sender.py:send():382] send: artifact
|
122 |
+
2024-08-03 19:15:39,290 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
123 |
+
2024-08-03 19:15:39,291 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 9
|
124 |
+
2024-08-03 19:15:39,317 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
|
125 |
+
2024-08-03 19:15:40,500 INFO wandb-upload_1:9246 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpftiijukc
|
126 |
+
2024-08-03 19:15:40,873 INFO wandb-upload_0:9246 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp8adahfv9
|
127 |
+
2024-08-03 19:15:42,354 INFO SenderThread:9246 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA4NTUyMzkyNA==', 'versionIndex': 0}}}
|
128 |
+
2024-08-03 19:15:42,354 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
129 |
+
2024-08-03 19:15:42,354 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 9
|
130 |
+
2024-08-03 19:15:42,354 INFO SenderThread:9246 [dir_watcher.py:finish():358] shutting down directory watcher
|
131 |
+
2024-08-03 19:15:43,268 INFO SenderThread:9246 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240803_191521-sg37tylz/files
|
132 |
+
2024-08-03 19:15:43,268 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt requirements.txt
|
133 |
+
2024-08-03 19:15:43,269 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml config.yaml
|
134 |
+
2024-08-03 19:15:43,270 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-metadata.json wandb-metadata.json
|
135 |
+
2024-08-03 19:15:43,270 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json wandb-summary.json
|
136 |
+
2024-08-03 19:15:43,271 INFO SenderThread:9246 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191521-sg37tylz/files/output.log output.log
|
137 |
+
2024-08-03 19:15:43,273 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 10
|
138 |
+
2024-08-03 19:15:43,273 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
|
139 |
+
2024-08-03 19:15:43,273 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
140 |
+
2024-08-03 19:15:43,275 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 10
|
141 |
+
2024-08-03 19:15:43,275 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
142 |
+
2024-08-03 19:15:43,275 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 10
|
143 |
+
2024-08-03 19:15:43,275 INFO SenderThread:9246 [file_pusher.py:finish():172] shutting down file pusher
|
144 |
+
2024-08-03 19:15:43,318 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
|
145 |
+
2024-08-03 19:15:43,318 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
|
146 |
+
2024-08-03 19:15:43,685 INFO wandb-upload_1:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/requirements.txt
|
147 |
+
2024-08-03 19:15:43,839 INFO wandb-upload_0:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/config.yaml
|
148 |
+
2024-08-03 19:15:43,870 INFO wandb-upload_2:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/wandb-summary.json
|
149 |
+
2024-08-03 19:15:43,873 INFO wandb-upload_3:9246 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191521-sg37tylz/files/output.log
|
150 |
+
2024-08-03 19:15:44,073 INFO Thread-11 (_thread_body):9246 [sender.py:transition_state():617] send defer: 11
|
151 |
+
2024-08-03 19:15:44,074 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
152 |
+
2024-08-03 19:15:44,074 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 11
|
153 |
+
2024-08-03 19:15:44,074 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
154 |
+
2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 11
|
155 |
+
2024-08-03 19:15:44,074 INFO SenderThread:9246 [file_pusher.py:join():178] waiting for file pusher
|
156 |
+
2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 12
|
157 |
+
2024-08-03 19:15:44,074 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
158 |
+
2024-08-03 19:15:44,074 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 12
|
159 |
+
2024-08-03 19:15:44,074 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
160 |
+
2024-08-03 19:15:44,074 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 12
|
161 |
+
2024-08-03 19:15:44,074 INFO SenderThread:9246 [file_stream.py:finish():595] file stream finish called
|
162 |
+
2024-08-03 19:15:44,248 INFO SenderThread:9246 [file_stream.py:finish():599] file stream finish is done
|
163 |
+
2024-08-03 19:15:44,248 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 13
|
164 |
+
2024-08-03 19:15:44,248 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
165 |
+
2024-08-03 19:15:44,248 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 13
|
166 |
+
2024-08-03 19:15:44,248 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
167 |
+
2024-08-03 19:15:44,248 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 13
|
168 |
+
2024-08-03 19:15:44,249 INFO SenderThread:9246 [sender.py:transition_state():617] send defer: 14
|
169 |
+
2024-08-03 19:15:44,249 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: defer
|
170 |
+
2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send():382] send: final
|
171 |
+
2024-08-03 19:15:44,249 INFO HandlerThread:9246 [handler.py:handle_request_defer():172] handle defer: 14
|
172 |
+
2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send():382] send: footer
|
173 |
+
2024-08-03 19:15:44,249 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: defer
|
174 |
+
2024-08-03 19:15:44,249 INFO SenderThread:9246 [sender.py:send_request_defer():613] handle sender defer: 14
|
175 |
+
2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
|
176 |
+
2024-08-03 19:15:44,250 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
|
177 |
+
2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: poll_exit
|
178 |
+
2024-08-03 19:15:44,250 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: server_info
|
179 |
+
2024-08-03 19:15:44,251 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: get_summary
|
180 |
+
2024-08-03 19:15:44,251 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: poll_exit
|
181 |
+
2024-08-03 19:15:44,251 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: server_info
|
182 |
+
2024-08-03 19:15:44,252 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: sampled_history
|
183 |
+
2024-08-03 19:15:44,252 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: internal_messages
|
184 |
+
2024-08-03 19:15:44,253 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: job_info
|
185 |
+
2024-08-03 19:15:44,413 DEBUG SenderThread:9246 [sender.py:send_request():409] send_request: job_info
|
186 |
+
2024-08-03 19:15:44,413 INFO MainThread:9246 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
187 |
+
2024-08-03 19:15:44,413 INFO MainThread:9246 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
188 |
+
2024-08-03 19:15:44,414 INFO MainThread:9246 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
189 |
+
2024-08-03 19:15:44,414 DEBUG HandlerThread:9246 [handler.py:handle_request():146] handle_request: shutdown
|
190 |
+
2024-08-03 19:15:44,414 INFO HandlerThread:9246 [handler.py:finish():869] shutting down handler
|
191 |
+
2024-08-03 19:15:45,253 INFO WriterThread:9246 [datastore.py:close():296] close: /project/wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb
|
192 |
+
2024-08-03 19:15:45,413 INFO SenderThread:9246 [sender.py:finish():1572] shutting down sender
|
193 |
+
2024-08-03 19:15:45,413 INFO SenderThread:9246 [file_pusher.py:finish():172] shutting down file pusher
|
194 |
+
2024-08-03 19:15:45,414 INFO SenderThread:9246 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240803_191521-sg37tylz/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-03 19:15:21,565 INFO MainThread:9173 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Configure stats pid to 9173
|
3 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_191521-sg37tylz/logs/debug.log
|
9 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_191521-sg37tylz/logs/debug-internal.log
|
10 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-03-19:14:48', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-03 19:15:21,566 INFO MainThread:9173 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-03 19:15:21,571 INFO MainThread:9173 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-03 19:15:21,573 INFO MainThread:9173 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-03 19:15:21,578 INFO MainThread:9173 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-03 19:15:21,751 INFO MainThread:9173 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-03 19:15:22,261 INFO MainThread:9173 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-03 19:15:22,435 INFO MainThread:9173 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-03 19:15:22,435 INFO MainThread:9173 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-03 19:15:22,513 INFO MainThread:9173 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-03 19:15:22,514 INFO MainThread:9173 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-03 19:15:37,109 INFO MainThread:9173 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
|
29 |
+
2024-08-03 19:15:37,110 INFO MainThread:9173 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-03 19:15:45,414 WARNING MsgRouterThr:9173 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240803_191521-sg37tylz/run-sg37tylz.wandb
ADDED
Binary file (17.1 kB). View file
|
|
wandb/run-20240803_191815-jdwps0z3/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample2_train_2024-08-03-19:18:05
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample2
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample2
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 1600
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 40
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample2
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722680295.872336
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 256
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: mistral
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 512
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 4
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 4
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: MistralForCausalLM
|
wandb/run-20240803_191815-jdwps0z3/files/output.log
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-mistral-sample2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
8 |
+
File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-mistral-sample2, skipping model loading
|
11 |
+
--> Model /share/pretrained_lm/custom/tiny-mistral
|
12 |
+
--> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
|
13 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
14 |
+
--> applying fsdp activation checkpointing...
|
15 |
+
> datasets target sizes (minimum size):
|
16 |
+
train: 32000000
|
17 |
+
validation: 1616000
|
18 |
+
test: 16000
|
19 |
+
> building train, validation, and test datasets for GPT ...
|
20 |
+
> finished creating GPT datasets ...
|
21 |
+
File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
22 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
23 |
+
No checkpoint found in /work/llm_recipes/models/tiny-mistral-sample2, skipping optimizer loading
|
24 |
+
File not found: /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
25 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-mistral-sample2/latest_iteration.txt
|
26 |
+
model info: FullyShardedDataParallel(
|
27 |
+
(_fsdp_wrapped_module): MistralForCausalLM(
|
28 |
+
(model): MistralModel(
|
29 |
+
(embed_tokens): Embedding(32768, 256)
|
30 |
+
(layers): ModuleList(
|
31 |
+
(0-3): 4 x FullyShardedDataParallel(
|
32 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
33 |
+
(_checkpoint_wrapped_module): MistralDecoderLayer(
|
34 |
+
(self_attn): MistralFlashAttention2(
|
35 |
+
(q_proj): Linear(in_features=256, out_features=512, bias=False)
|
36 |
+
(k_proj): Linear(in_features=256, out_features=256, bias=False)
|
37 |
+
(v_proj): Linear(in_features=256, out_features=256, bias=False)
|
38 |
+
(o_proj): Linear(in_features=512, out_features=256, bias=False)
|
39 |
+
(rotary_emb): MistralRotaryEmbedding()
|
40 |
+
)
|
41 |
+
(mlp): MistralMLP(
|
42 |
+
(gate_proj): Linear(in_features=256, out_features=512, bias=False)
|
43 |
+
(up_proj): Linear(in_features=256, out_features=512, bias=False)
|
44 |
+
(down_proj): Linear(in_features=512, out_features=256, bias=False)
|
45 |
+
(act_fn): SiLU()
|
46 |
+
)
|
47 |
+
(input_layernorm): MistralRMSNorm()
|
48 |
+
(post_attention_layernorm): MistralRMSNorm()
|
49 |
+
)
|
50 |
+
)
|
51 |
+
)
|
52 |
+
)
|
53 |
+
(norm): MistralRMSNorm()
|
54 |
+
)
|
55 |
+
(lm_head): Linear(in_features=256, out_features=32768, bias=False)
|
56 |
+
)
|
57 |
+
)
|
58 |
+
model config: MistralConfig {
|
59 |
+
"_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
|
60 |
+
"architectures": [
|
61 |
+
"MistralForCausalLM"
|
62 |
+
],
|
63 |
+
"attention_dropout": 0.0,
|
64 |
+
"bos_token_id": 1,
|
65 |
+
"eos_token_id": 2,
|
66 |
+
"head_dim": 128,
|
67 |
+
"hidden_act": "silu",
|
68 |
+
"hidden_size": 256,
|
69 |
+
"initializer_range": 0.02,
|
70 |
+
"intermediate_size": 512,
|
71 |
+
"label_smoothing": 0.0,
|
72 |
+
"max_position_embeddings": 512,
|
73 |
+
"model_type": "mistral",
|
74 |
+
"num_attention_heads": 4,
|
75 |
+
"num_hidden_layers": 4,
|
76 |
+
"num_key_value_heads": 2,
|
77 |
+
"rms_norm_eps": 1e-05,
|
78 |
+
"rope_theta": 1000000.0,
|
79 |
+
"sliding_window": 4096,
|
80 |
+
"tie_word_embeddings": false,
|
81 |
+
"torch_dtype": "float32",
|
82 |
+
"transformers_version": "4.43.3",
|
83 |
+
"use_cache": false,
|
84 |
+
"vocab_size": 32768
|
85 |
+
}
|
86 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
87 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
88 |
+
warnings.warn(
|
89 |
+
Let split = None
|
90 |
+
Building a BlendedDataset for a single MegatronDataset
|
91 |
+
Unable to save the indexes because path_to_cache is None
|
92 |
+
Building a BlendedDataset for a single MegatronDataset
|
93 |
+
Unable to save the indexes because path_to_cache is None
|
94 |
+
Building a BlendedDataset for a single MegatronDataset
|
95 |
+
Unable to save the indexes because path_to_cache is None
|
96 |
+
------------------------------------------------------------------
|
97 |
+
iteration: 1 , TFLOPS: 7.474408749953364, Tokens per sec: 105583.91620199519, Loss: 10.45809555053711
|
98 |
+
------------------------------------------------------------------
|
99 |
+
------------------------------------------------------------------
|
100 |
+
iteration: 2 , TFLOPS: 19.626322889381036, Tokens per sec: 277242.53524650185, Loss: 10.456090927124023
|
101 |
+
------------------------------------------------------------------
|
102 |
+
------------------------------------------------------------------
|
103 |
+
iteration: 3 , TFLOPS: 19.74558354343518, Tokens per sec: 278927.2179184158, Loss: 10.458115577697754
|
104 |
+
------------------------------------------------------------------
|
105 |
+
------------------------------------------------------------------
|
106 |
+
iteration: 4 , TFLOPS: 19.588618586475842, Tokens per sec: 276709.92215407215, Loss: 10.456512451171875
|
107 |
+
------------------------------------------------------------------
|
108 |
+
------------------------------------------------------------------
|
109 |
+
iteration: 5 , TFLOPS: 19.72619917981179, Tokens per sec: 278653.39331329847, Loss: 10.458467483520508
|
110 |
+
------------------------------------------------------------------
|
111 |
+
------------------------------------------------------------------
|
112 |
+
iteration: 6 , TFLOPS: 19.627595003583973, Tokens per sec: 277260.5052029086, Loss: 10.457172393798828
|
113 |
+
------------------------------------------------------------------
|
114 |
+
------------------------------------------------------------------
|
115 |
+
iteration: 7 , TFLOPS: 19.77370988376223, Tokens per sec: 279324.53217557067, Loss: 10.457324981689453
|
116 |
+
------------------------------------------------------------------
|
117 |
+
------------------------------------------------------------------
|
118 |
+
iteration: 8 , TFLOPS: 19.77766998452317, Tokens per sec: 279380.47278049105, Loss: 10.457306861877441
|
119 |
+
------------------------------------------------------------------
|
120 |
+
------------------------------------------------------------------
|
121 |
+
iteration: 9 , TFLOPS: 19.749958254856136, Tokens per sec: 278989.01533671736, Loss: 10.457944869995117
|
122 |
+
------------------------------------------------------------------
|
123 |
+
------------------------------------------------------------------
|
124 |
+
iteration: 10 , TFLOPS: 19.752309517490676, Tokens per sec: 279022.22940424824, Loss: 10.45663833618164
|
125 |
+
------------------------------------------------------------------
|
126 |
+
------------------------------------------------------------------
|
127 |
+
iteration: 11 , TFLOPS: 19.74392548352723, Tokens per sec: 278903.7960713861, Loss: 10.456352233886719
|
128 |
+
------------------------------------------------------------------
|
129 |
+
------------------------------------------------------------------
|
130 |
+
iteration: 12 , TFLOPS: 19.758394357809003, Tokens per sec: 279108.18419903744, Loss: 10.455950736999512
|
131 |
+
------------------------------------------------------------------
|
132 |
+
------------------------------------------------------------------
|
133 |
+
iteration: 13 , TFLOPS: 19.729624964309398, Tokens per sec: 278701.78613678756, Loss: 10.45804214477539
|
134 |
+
------------------------------------------------------------------
|
135 |
+
------------------------------------------------------------------
|
136 |
+
iteration: 14 , TFLOPS: 19.626730348672773, Tokens per sec: 277248.29103925463, Loss: 10.457955360412598
|
137 |
+
------------------------------------------------------------------
|
138 |
+
------------------------------------------------------------------
|
139 |
+
iteration: 15 , TFLOPS: 19.724026466287597, Tokens per sec: 278622.7014404904, Loss: 10.459123611450195
|
140 |
+
------------------------------------------------------------------
|
141 |
+
------------------------------------------------------------------
|
142 |
+
iteration: 16 , TFLOPS: 19.108803574684035, Tokens per sec: 269932.0284514028, Loss: 10.45695686340332
|
143 |
+
------------------------------------------------------------------
|
144 |
+
------------------------------------------------------------------
|
145 |
+
iteration: 17 , TFLOPS: 19.73958477407385, Tokens per sec: 278842.4789667809, Loss: 10.456724166870117
|
146 |
+
------------------------------------------------------------------
|
147 |
+
------------------------------------------------------------------
|
148 |
+
iteration: 18 , TFLOPS: 19.47597897335848, Tokens per sec: 275118.76867688465, Loss: 10.456052780151367
|
149 |
+
------------------------------------------------------------------
|
150 |
+
------------------------------------------------------------------
|
151 |
+
iteration: 19 , TFLOPS: 19.72614156699911, Tokens per sec: 278652.57947148307, Loss: 10.458043098449707
|
152 |
+
------------------------------------------------------------------
|
153 |
+
------------------------------------------------------------------
|
154 |
+
iteration: 20 , TFLOPS: 19.701494428407866, Tokens per sec: 278304.4126127127, Loss: 10.455198287963867
|
155 |
+
------------------------------------------------------------------
|
156 |
+
------------------------------------------------------------------
|
157 |
+
iteration: 21 , TFLOPS: 19.74596953791029, Tokens per sec: 278932.6704979679, Loss: 10.457011222839355
|
158 |
+
------------------------------------------------------------------
|
159 |
+
------------------------------------------------------------------
|
160 |
+
iteration: 22 , TFLOPS: 19.615963035173014, Tokens per sec: 277096.1913663158, Loss: 10.457304954528809
|
161 |
+
------------------------------------------------------------------
|
162 |
+
------------------------------------------------------------------
|
163 |
+
iteration: 23 , TFLOPS: 19.718852381531324, Tokens per sec: 278549.61202972836, Loss: 10.456293106079102
|
164 |
+
------------------------------------------------------------------
|
165 |
+
------------------------------------------------------------------
|
166 |
+
iteration: 24 , TFLOPS: 19.67743842750437, Tokens per sec: 277964.5961979942, Loss: 10.456793785095215
|
167 |
+
------------------------------------------------------------------
|
168 |
+
------------------------------------------------------------------
|
169 |
+
iteration: 25 , TFLOPS: 19.72792104268366, Tokens per sec: 278677.7164445664, Loss: 10.45568561553955
|
170 |
+
------------------------------------------------------------------
|
171 |
+
------------------------------------------------------------------
|
172 |
+
iteration: 26 , TFLOPS: 19.766049190998213, Tokens per sec: 279216.31680096325, Loss: 10.455270767211914
|
173 |
+
------------------------------------------------------------------
|
174 |
+
------------------------------------------------------------------
|
175 |
+
iteration: 27 , TFLOPS: 19.748005203155174, Tokens per sec: 278961.4264191145, Loss: 10.456525802612305
|
176 |
+
------------------------------------------------------------------
|
177 |
+
------------------------------------------------------------------
|
178 |
+
iteration: 28 , TFLOPS: 19.788746629218007, Tokens per sec: 279536.9420831989, Loss: 10.458827018737793
|
179 |
+
------------------------------------------------------------------
|
180 |
+
------------------------------------------------------------------
|
181 |
+
iteration: 29 , TFLOPS: 19.64595613343327, Tokens per sec: 277519.8755504821, Loss: 10.454755783081055
|
182 |
+
------------------------------------------------------------------
|
183 |
+
------------------------------------------------------------------
|
184 |
+
iteration: 30 , TFLOPS: 19.791751212574006, Tokens per sec: 279579.38499579503, Loss: 10.455424308776855
|
185 |
+
------------------------------------------------------------------
|
186 |
+
------------------------------------------------------------------
|
187 |
+
iteration: 31 , TFLOPS: 19.647830180890995, Tokens per sec: 277546.34844972467, Loss: 10.455726623535156
|
188 |
+
------------------------------------------------------------------
|
189 |
+
------------------------------------------------------------------
|
190 |
+
iteration: 32 , TFLOPS: 19.108735127334096, Tokens per sec: 269931.06156030786, Loss: 10.456134796142578
|
191 |
+
------------------------------------------------------------------
|
192 |
+
------------------------------------------------------------------
|
193 |
+
iteration: 33 , TFLOPS: 19.790956814139804, Tokens per sec: 279568.16327906615, Loss: 10.45483684539795
|
194 |
+
------------------------------------------------------------------
|
195 |
+
------------------------------------------------------------------
|
196 |
+
iteration: 34 , TFLOPS: 19.809776481063633, Tokens per sec: 279834.0109470101, Loss: 10.455580711364746
|
197 |
+
------------------------------------------------------------------
|
198 |
+
------------------------------------------------------------------
|
199 |
+
iteration: 35 , TFLOPS: 19.78921404344017, Tokens per sec: 279543.5448026535, Loss: 10.4553861618042
|
200 |
+
------------------------------------------------------------------
|
201 |
+
------------------------------------------------------------------
|
202 |
+
iteration: 36 , TFLOPS: 19.760295167208643, Tokens per sec: 279135.03513896884, Loss: 10.455459594726562
|
203 |
+
------------------------------------------------------------------
|
204 |
+
------------------------------------------------------------------
|
205 |
+
iteration: 37 , TFLOPS: 19.77547756611738, Tokens per sec: 279349.502555423, Loss: 10.45541000366211
|
206 |
+
------------------------------------------------------------------
|
207 |
+
------------------------------------------------------------------
|
208 |
+
iteration: 38 , TFLOPS: 19.777886920468827, Tokens per sec: 279383.53722979716, Loss: 10.455677032470703
|
209 |
+
------------------------------------------------------------------
|
210 |
+
------------------------------------------------------------------
|
211 |
+
iteration: 39 , TFLOPS: 19.686203609180314, Tokens per sec: 278088.4136447687, Loss: 10.454801559448242
|
212 |
+
------------------------------------------------------------------
|
213 |
+
------------------------------------------------------------------
|
214 |
+
iteration: 40 , TFLOPS: 19.733730730492624, Tokens per sec: 278759.7844196133, Loss: 10.45442008972168
|
215 |
+
------------------------------------------------------------------
|
216 |
+
------------------------------------------------------------------
|
217 |
+
iteration: 41 , TFLOPS: 19.781936717382273, Tokens per sec: 279440.74488758645, Loss: 10.453974723815918
|
218 |
+
------------------------------------------------------------------
|
219 |
+
------------------------------------------------------------------
|
220 |
+
iteration: 42 , TFLOPS: 19.713193575703478, Tokens per sec: 278469.67542198877, Loss: 10.454327583312988
|
221 |
+
------------------------------------------------------------------
|
222 |
+
------------------------------------------------------------------
|
223 |
+
iteration: 43 , TFLOPS: 19.765683899965286, Tokens per sec: 279211.1566793938, Loss: 10.453243255615234
|
224 |
+
------------------------------------------------------------------
|
225 |
+
------------------------------------------------------------------
|
226 |
+
iteration: 44 , TFLOPS: 19.737930705062368, Tokens per sec: 278819.11349537794, Loss: 10.452882766723633
|
227 |
+
------------------------------------------------------------------
|
228 |
+
------------------------------------------------------------------
|
229 |
+
iteration: 45 , TFLOPS: 19.768669599163157, Tokens per sec: 279253.3328333156, Loss: 10.452847480773926
|
230 |
+
------------------------------------------------------------------
|
231 |
+
------------------------------------------------------------------
|
232 |
+
iteration: 46 , TFLOPS: 19.195477136883916, Tokens per sec: 271156.3840405051, Loss: 10.452914237976074
|
233 |
+
------------------------------------------------------------------
|
234 |
+
------------------------------------------------------------------
|
235 |
+
iteration: 47 , TFLOPS: 19.78773782868223, Tokens per sec: 279522.69171038724, Loss: 10.452442169189453
|
236 |
+
------------------------------------------------------------------
|
237 |
+
------------------------------------------------------------------
|
238 |
+
iteration: 48 , TFLOPS: 19.79785807451266, Tokens per sec: 279665.6508692251, Loss: 10.452777862548828
|
239 |
+
------------------------------------------------------------------
|
wandb/run-20240803_191815-jdwps0z3/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-03T10:18:16.495932",
|
5 |
+
"startedAt": "2024-08-03T10:18:15.860061",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"40",
|
15 |
+
"--global-batch-size",
|
16 |
+
"1600",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample2",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample2",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample2",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample2_train_2024-08-03-19:18:05"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.034,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.034,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.034,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.034,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.034,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.034,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.034,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.034,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.034,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.034,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.034,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.034,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.034,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.034,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.034,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.034,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.034,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.034,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.034,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 10.452777862548828, "training/perplexity": 34640.467638738424, "utils/batch_size": 40, "utils/global_batch_size": 1600, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.0037958921404718368, "optimizer/variance_sqrt_l2": 0.9574357404136762, "optimizer/momentum_l2": 0.9945472829864366, "optimizer/weight_l2": 101.38293742045552, "optimizer/variance_l1": 0.91748046875, "optimizer/variance_sqrt_l1": 723.0, "optimizer/momentum_l1": 740.5, "optimizer/weight_l1": 320512.0, "optimizer/variance_abs_max": 0.0003185272216796875, "optimizer/variance_sqrt_abs_max": 0.017822265625, "optimizer/momentum_abs_max": 0.0184326171875, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 2.9349331869998423, "stats/tokens_per_sec": 279665.6508692251, "stats/tokens_per_sec_per_gpu": 279665.6508692251, "stats/tflops": 19.79785807451266, "_timestamp": 1722680446.903461, "_runtime": 151.03112506866455, "_step": 48, "_wandb": {"runtime": 151}}
|
wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log
ADDED
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-03 19:18:15,873 INFO StreamThr :9504 [internal.py:wandb_internal():86] W&B internal server running at pid: 9504, started at: 2024-08-03 19:18:15.872881
|
2 |
+
2024-08-03 19:18:15,875 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-03 19:18:15,877 INFO WriterThread:9504 [datastore.py:open_for_write():87] open: /project/wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb
|
4 |
+
2024-08-03 19:18:15,878 DEBUG SenderThread:9504 [sender.py:send():382] send: header
|
5 |
+
2024-08-03 19:18:15,892 DEBUG SenderThread:9504 [sender.py:send():382] send: run
|
6 |
+
2024-08-03 19:18:16,382 INFO SenderThread:9504 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240803_191815-jdwps0z3/files
|
7 |
+
2024-08-03 19:18:16,382 INFO SenderThread:9504 [sender.py:_start_run_threads():1136] run started: jdwps0z3 with start time 1722680295.872336
|
8 |
+
2024-08-03 19:18:16,387 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-03 19:18:16,387 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-03 19:18:16,477 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-03 19:18:16,483 DEBUG HandlerThread:9504 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-03 19:18:16,483 DEBUG HandlerThread:9504 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-03 19:18:16,483 INFO HandlerThread:9504 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-03 19:18:16,483 INFO SystemMonitor:9504 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-03 19:18:16,483 INFO HandlerThread:9504 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-03 19:18:16,484 INFO SystemMonitor:9504 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-03 19:18:16,485 INFO SystemMonitor:9504 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-03 19:18:16,486 INFO SystemMonitor:9504 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-03 19:18:16,486 INFO SystemMonitor:9504 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-03 19:18:16,487 INFO SystemMonitor:9504 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-03 19:18:16,495 DEBUG HandlerThread:9504 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-03 19:18:16,497 DEBUG HandlerThread:9504 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-03 19:18:16,509 DEBUG HandlerThread:9504 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T10:18:16.495932', 'startedAt': '2024-08-03T10:18:15.860061', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '40', '--global-batch-size', '1600', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample2', '--load', '/work/llm_recipes/models/tiny-mistral-sample2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample2_train_2024-08-03-19:18:05'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-03 19:18:16,509 INFO HandlerThread:9504 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-03 19:18:16,509 INFO HandlerThread:9504 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-03 19:18:16,510 INFO HandlerThread:9504 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-03 19:18:16,516 DEBUG SenderThread:9504 [sender.py:send():382] send: files
|
30 |
+
2024-08-03 19:18:16,516 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-03 19:18:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-03 19:18:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-03 19:18:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-03 19:18:16,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-03 19:18:16,528 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-03 19:18:16,829 DEBUG SenderThread:9504 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-03 19:18:17,199 INFO wandb-upload_0:9504 [upload_job.py:push():131] Uploaded file /tmp/tmp76a9qs5lwandb/q74vxtjq-wandb-metadata.json
|
38 |
+
2024-08-03 19:18:17,385 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json
|
39 |
+
2024-08-03 19:18:17,386 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt
|
40 |
+
2024-08-03 19:18:17,386 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
41 |
+
2024-08-03 19:18:17,990 DEBUG SenderThread:9504 [sender.py:send():382] send: config
|
42 |
+
2024-08-03 19:18:17,991 DEBUG SenderThread:9504 [sender.py:send():382] send: config
|
43 |
+
2024-08-03 19:18:19,388 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
44 |
+
2024-08-03 19:18:20,991 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-03 19:18:25,992 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-03 19:18:28,096 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
47 |
+
2024-08-03 19:18:30,394 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
48 |
+
2024-08-03 19:18:31,059 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
49 |
+
2024-08-03 19:18:31,062 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
50 |
+
2024-08-03 19:18:31,063 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
51 |
+
2024-08-03 19:18:31,063 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
52 |
+
2024-08-03 19:18:31,064 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
53 |
+
2024-08-03 19:18:31,395 INFO Thread-12 :9504 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
54 |
+
2024-08-03 19:18:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
55 |
+
2024-08-03 19:18:31,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
56 |
+
2024-08-03 19:18:31,568 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
57 |
+
2024-08-03 19:18:34,003 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
58 |
+
2024-08-03 19:18:34,005 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
59 |
+
2024-08-03 19:18:34,005 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
60 |
+
2024-08-03 19:18:34,006 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
61 |
+
2024-08-03 19:18:34,397 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
62 |
+
2024-08-03 19:18:34,397 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
63 |
+
2024-08-03 19:18:36,398 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
64 |
+
2024-08-03 19:18:36,971 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
65 |
+
2024-08-03 19:18:36,974 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
66 |
+
2024-08-03 19:18:36,975 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
67 |
+
2024-08-03 19:18:36,976 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
68 |
+
2024-08-03 19:18:36,977 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
69 |
+
2024-08-03 19:18:37,399 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
70 |
+
2024-08-03 19:18:39,919 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
71 |
+
2024-08-03 19:18:39,921 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
72 |
+
2024-08-03 19:18:39,921 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
73 |
+
2024-08-03 19:18:39,922 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
74 |
+
2024-08-03 19:18:40,401 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
75 |
+
2024-08-03 19:18:40,401 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
76 |
+
2024-08-03 19:18:42,402 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
77 |
+
2024-08-03 19:18:42,881 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
78 |
+
2024-08-03 19:18:42,883 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
79 |
+
2024-08-03 19:18:42,883 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
80 |
+
2024-08-03 19:18:42,884 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
81 |
+
2024-08-03 19:18:42,885 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
82 |
+
2024-08-03 19:18:43,403 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
83 |
+
2024-08-03 19:18:44,403 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
84 |
+
2024-08-03 19:18:45,821 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
85 |
+
2024-08-03 19:18:45,823 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
86 |
+
2024-08-03 19:18:45,824 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
87 |
+
2024-08-03 19:18:45,825 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
88 |
+
2024-08-03 19:18:46,405 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
89 |
+
2024-08-03 19:18:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
90 |
+
2024-08-03 19:18:46,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
91 |
+
2024-08-03 19:18:46,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
92 |
+
2024-08-03 19:18:48,406 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
93 |
+
2024-08-03 19:18:48,697 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-03 19:18:48,761 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
95 |
+
2024-08-03 19:18:48,885 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
96 |
+
2024-08-03 19:18:48,885 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
97 |
+
2024-08-03 19:18:48,887 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
98 |
+
2024-08-03 19:18:49,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
99 |
+
2024-08-03 19:18:49,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml
|
100 |
+
2024-08-03 19:18:50,407 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
101 |
+
2024-08-03 19:18:51,705 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
102 |
+
2024-08-03 19:18:51,707 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
103 |
+
2024-08-03 19:18:51,708 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
104 |
+
2024-08-03 19:18:51,709 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
105 |
+
2024-08-03 19:18:52,409 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
106 |
+
2024-08-03 19:18:54,410 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
107 |
+
2024-08-03 19:18:54,648 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
108 |
+
2024-08-03 19:18:54,651 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
109 |
+
2024-08-03 19:18:54,651 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
110 |
+
2024-08-03 19:18:54,651 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
111 |
+
2024-08-03 19:18:54,652 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
112 |
+
2024-08-03 19:18:55,411 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
113 |
+
2024-08-03 19:18:56,412 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
114 |
+
2024-08-03 19:18:57,593 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
115 |
+
2024-08-03 19:18:57,596 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
116 |
+
2024-08-03 19:18:57,596 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
117 |
+
2024-08-03 19:18:57,597 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
118 |
+
2024-08-03 19:18:58,413 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
119 |
+
2024-08-03 19:19:00,414 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
120 |
+
2024-08-03 19:19:00,536 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
121 |
+
2024-08-03 19:19:00,540 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
122 |
+
2024-08-03 19:19:00,540 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
123 |
+
2024-08-03 19:19:00,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
124 |
+
2024-08-03 19:19:00,542 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
125 |
+
2024-08-03 19:19:01,415 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
126 |
+
2024-08-03 19:19:01,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
127 |
+
2024-08-03 19:19:01,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
128 |
+
2024-08-03 19:19:01,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
129 |
+
2024-08-03 19:19:02,416 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
130 |
+
2024-08-03 19:19:03,483 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
131 |
+
2024-08-03 19:19:03,484 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
132 |
+
2024-08-03 19:19:03,485 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
133 |
+
2024-08-03 19:19:03,486 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
134 |
+
2024-08-03 19:19:04,417 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
135 |
+
2024-08-03 19:19:06,418 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
136 |
+
2024-08-03 19:19:06,446 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
137 |
+
2024-08-03 19:19:06,448 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
138 |
+
2024-08-03 19:19:06,448 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
139 |
+
2024-08-03 19:19:06,448 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
140 |
+
2024-08-03 19:19:06,449 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
141 |
+
2024-08-03 19:19:07,420 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
142 |
+
2024-08-03 19:19:08,420 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
143 |
+
2024-08-03 19:19:09,393 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
144 |
+
2024-08-03 19:19:09,396 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
145 |
+
2024-08-03 19:19:09,396 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
146 |
+
2024-08-03 19:19:09,397 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
147 |
+
2024-08-03 19:19:09,421 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
148 |
+
2024-08-03 19:19:10,422 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
149 |
+
2024-08-03 19:19:12,398 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
150 |
+
2024-08-03 19:19:12,436 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
151 |
+
2024-08-03 19:19:12,438 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
152 |
+
2024-08-03 19:19:12,438 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
153 |
+
2024-08-03 19:19:12,440 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
154 |
+
2024-08-03 19:19:13,424 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
155 |
+
2024-08-03 19:19:14,424 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
156 |
+
2024-08-03 19:19:15,381 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
157 |
+
2024-08-03 19:19:15,383 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
158 |
+
2024-08-03 19:19:15,384 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
159 |
+
2024-08-03 19:19:15,385 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
160 |
+
2024-08-03 19:19:15,425 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
161 |
+
2024-08-03 19:19:16,426 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
162 |
+
2024-08-03 19:19:16,487 DEBUG SystemMonitor:9504 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
163 |
+
2024-08-03 19:19:16,489 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
|
164 |
+
2024-08-03 19:19:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
165 |
+
2024-08-03 19:19:16,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
166 |
+
2024-08-03 19:19:16,526 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
167 |
+
2024-08-03 19:19:17,757 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
168 |
+
2024-08-03 19:19:18,366 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
169 |
+
2024-08-03 19:19:18,368 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
170 |
+
2024-08-03 19:19:18,368 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
171 |
+
2024-08-03 19:19:18,369 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
172 |
+
2024-08-03 19:19:18,427 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
173 |
+
2024-08-03 19:19:20,428 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
174 |
+
2024-08-03 19:19:21,314 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
175 |
+
2024-08-03 19:19:21,316 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
176 |
+
2024-08-03 19:19:21,317 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
177 |
+
2024-08-03 19:19:21,318 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
178 |
+
2024-08-03 19:19:21,429 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
179 |
+
2024-08-03 19:19:22,430 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
180 |
+
2024-08-03 19:19:23,318 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
181 |
+
2024-08-03 19:19:24,265 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
182 |
+
2024-08-03 19:19:24,267 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
183 |
+
2024-08-03 19:19:24,268 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
184 |
+
2024-08-03 19:19:24,269 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
185 |
+
2024-08-03 19:19:24,431 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
186 |
+
2024-08-03 19:19:26,432 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
187 |
+
2024-08-03 19:19:27,210 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
188 |
+
2024-08-03 19:19:27,212 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
189 |
+
2024-08-03 19:19:27,213 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
190 |
+
2024-08-03 19:19:27,214 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
191 |
+
2024-08-03 19:19:27,433 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
192 |
+
2024-08-03 19:19:28,434 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
193 |
+
2024-08-03 19:19:29,214 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
194 |
+
2024-08-03 19:19:30,174 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
195 |
+
2024-08-03 19:19:30,175 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
196 |
+
2024-08-03 19:19:30,176 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
197 |
+
2024-08-03 19:19:30,177 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
198 |
+
2024-08-03 19:19:30,435 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
199 |
+
2024-08-03 19:19:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
200 |
+
2024-08-03 19:19:31,525 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
201 |
+
2024-08-03 19:19:31,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
202 |
+
2024-08-03 19:19:32,436 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
203 |
+
2024-08-03 19:19:33,125 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
204 |
+
2024-08-03 19:19:33,126 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
205 |
+
2024-08-03 19:19:33,126 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
206 |
+
2024-08-03 19:19:33,127 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
207 |
+
2024-08-03 19:19:33,437 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
208 |
+
2024-08-03 19:19:34,438 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
209 |
+
2024-08-03 19:19:35,169 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
210 |
+
2024-08-03 19:19:36,079 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
211 |
+
2024-08-03 19:19:36,081 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
212 |
+
2024-08-03 19:19:36,082 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
213 |
+
2024-08-03 19:19:36,084 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
214 |
+
2024-08-03 19:19:36,439 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
215 |
+
2024-08-03 19:19:38,441 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
216 |
+
2024-08-03 19:19:39,026 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
217 |
+
2024-08-03 19:19:39,029 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
218 |
+
2024-08-03 19:19:39,029 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
219 |
+
2024-08-03 19:19:39,030 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
220 |
+
2024-08-03 19:19:39,442 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
221 |
+
2024-08-03 19:19:40,443 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
222 |
+
2024-08-03 19:19:41,031 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
223 |
+
2024-08-03 19:19:41,968 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
224 |
+
2024-08-03 19:19:41,970 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
225 |
+
2024-08-03 19:19:41,970 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
226 |
+
2024-08-03 19:19:41,972 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
227 |
+
2024-08-03 19:19:42,444 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
228 |
+
2024-08-03 19:19:44,445 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
229 |
+
2024-08-03 19:19:44,912 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
230 |
+
2024-08-03 19:19:44,914 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
231 |
+
2024-08-03 19:19:44,915 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
232 |
+
2024-08-03 19:19:44,916 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
233 |
+
2024-08-03 19:19:45,447 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
234 |
+
2024-08-03 19:19:46,447 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
235 |
+
2024-08-03 19:19:46,489 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
|
236 |
+
2024-08-03 19:19:46,490 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
237 |
+
2024-08-03 19:19:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
238 |
+
2024-08-03 19:19:46,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
239 |
+
2024-08-03 19:19:46,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
240 |
+
2024-08-03 19:19:47,851 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
241 |
+
2024-08-03 19:19:47,852 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
242 |
+
2024-08-03 19:19:47,852 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
243 |
+
2024-08-03 19:19:47,853 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
244 |
+
2024-08-03 19:19:48,449 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
245 |
+
2024-08-03 19:19:50,450 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
246 |
+
2024-08-03 19:19:50,810 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
247 |
+
2024-08-03 19:19:50,814 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
248 |
+
2024-08-03 19:19:50,814 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
249 |
+
2024-08-03 19:19:50,815 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
250 |
+
2024-08-03 19:19:51,451 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
251 |
+
2024-08-03 19:19:51,815 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
252 |
+
2024-08-03 19:19:52,452 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
253 |
+
2024-08-03 19:19:53,748 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
254 |
+
2024-08-03 19:19:53,750 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
255 |
+
2024-08-03 19:19:53,750 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
256 |
+
2024-08-03 19:19:53,752 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
257 |
+
2024-08-03 19:19:54,454 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
258 |
+
2024-08-03 19:19:56,455 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
259 |
+
2024-08-03 19:19:56,707 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
260 |
+
2024-08-03 19:19:56,709 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
261 |
+
2024-08-03 19:19:56,709 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
262 |
+
2024-08-03 19:19:56,711 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
263 |
+
2024-08-03 19:19:57,456 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
264 |
+
2024-08-03 19:19:57,712 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
265 |
+
2024-08-03 19:19:58,456 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
266 |
+
2024-08-03 19:19:59,749 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
267 |
+
2024-08-03 19:19:59,752 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
268 |
+
2024-08-03 19:19:59,752 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
269 |
+
2024-08-03 19:19:59,754 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
270 |
+
2024-08-03 19:20:00,458 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
271 |
+
2024-08-03 19:20:01,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
272 |
+
2024-08-03 19:20:01,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
273 |
+
2024-08-03 19:20:01,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
274 |
+
2024-08-03 19:20:02,459 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
275 |
+
2024-08-03 19:20:02,687 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
276 |
+
2024-08-03 19:20:02,689 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
277 |
+
2024-08-03 19:20:02,689 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
278 |
+
2024-08-03 19:20:02,690 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
279 |
+
2024-08-03 19:20:02,729 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
280 |
+
2024-08-03 19:20:03,460 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
281 |
+
2024-08-03 19:20:04,460 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
282 |
+
2024-08-03 19:20:05,622 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
283 |
+
2024-08-03 19:20:05,624 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
284 |
+
2024-08-03 19:20:05,625 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
285 |
+
2024-08-03 19:20:05,626 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
286 |
+
2024-08-03 19:20:06,462 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
287 |
+
2024-08-03 19:20:08,463 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
288 |
+
2024-08-03 19:20:08,560 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
289 |
+
2024-08-03 19:20:08,563 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
290 |
+
2024-08-03 19:20:08,563 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
291 |
+
2024-08-03 19:20:08,563 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
292 |
+
2024-08-03 19:20:08,564 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
293 |
+
2024-08-03 19:20:09,464 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
294 |
+
2024-08-03 19:20:10,464 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
295 |
+
2024-08-03 19:20:11,503 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
296 |
+
2024-08-03 19:20:11,505 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
297 |
+
2024-08-03 19:20:11,506 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
298 |
+
2024-08-03 19:20:11,507 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
299 |
+
2024-08-03 19:20:12,466 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
300 |
+
2024-08-03 19:20:12,466 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
301 |
+
2024-08-03 19:20:14,443 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
302 |
+
2024-08-03 19:20:14,468 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
303 |
+
2024-08-03 19:20:14,469 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
304 |
+
2024-08-03 19:20:14,469 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
305 |
+
2024-08-03 19:20:14,470 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
306 |
+
2024-08-03 19:20:15,470 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
307 |
+
2024-08-03 19:20:16,471 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
308 |
+
2024-08-03 19:20:16,490 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
|
309 |
+
2024-08-03 19:20:16,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
310 |
+
2024-08-03 19:20:16,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
311 |
+
2024-08-03 19:20:16,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
312 |
+
2024-08-03 19:20:17,382 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
313 |
+
2024-08-03 19:20:17,384 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
314 |
+
2024-08-03 19:20:17,384 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
315 |
+
2024-08-03 19:20:17,385 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
316 |
+
2024-08-03 19:20:17,471 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
317 |
+
2024-08-03 19:20:18,472 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
318 |
+
2024-08-03 19:20:20,336 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
319 |
+
2024-08-03 19:20:20,338 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
320 |
+
2024-08-03 19:20:20,339 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
321 |
+
2024-08-03 19:20:20,339 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
322 |
+
2024-08-03 19:20:20,340 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
323 |
+
2024-08-03 19:20:20,473 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
324 |
+
2024-08-03 19:20:22,474 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
325 |
+
2024-08-03 19:20:23,282 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
326 |
+
2024-08-03 19:20:23,284 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
327 |
+
2024-08-03 19:20:23,285 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
328 |
+
2024-08-03 19:20:23,286 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
329 |
+
2024-08-03 19:20:23,475 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
330 |
+
2024-08-03 19:20:24,476 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
331 |
+
2024-08-03 19:20:26,221 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
332 |
+
2024-08-03 19:20:26,223 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
333 |
+
2024-08-03 19:20:26,224 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
334 |
+
2024-08-03 19:20:26,224 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
335 |
+
2024-08-03 19:20:26,225 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
336 |
+
2024-08-03 19:20:26,477 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
337 |
+
2024-08-03 19:20:28,478 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
338 |
+
2024-08-03 19:20:29,171 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
339 |
+
2024-08-03 19:20:29,173 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
340 |
+
2024-08-03 19:20:29,174 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
341 |
+
2024-08-03 19:20:29,175 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
342 |
+
2024-08-03 19:20:29,479 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
343 |
+
2024-08-03 19:20:30,480 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
344 |
+
2024-08-03 19:20:31,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
345 |
+
2024-08-03 19:20:31,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
346 |
+
2024-08-03 19:20:31,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
347 |
+
2024-08-03 19:20:31,709 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
348 |
+
2024-08-03 19:20:32,112 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
349 |
+
2024-08-03 19:20:32,114 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
350 |
+
2024-08-03 19:20:32,114 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
351 |
+
2024-08-03 19:20:32,115 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
352 |
+
2024-08-03 19:20:32,482 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
353 |
+
2024-08-03 19:20:34,483 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
354 |
+
2024-08-03 19:20:35,058 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
355 |
+
2024-08-03 19:20:35,060 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
356 |
+
2024-08-03 19:20:35,061 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
357 |
+
2024-08-03 19:20:35,062 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
358 |
+
2024-08-03 19:20:35,484 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
359 |
+
2024-08-03 19:20:36,485 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
360 |
+
2024-08-03 19:20:37,062 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
361 |
+
2024-08-03 19:20:37,999 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
362 |
+
2024-08-03 19:20:38,001 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
363 |
+
2024-08-03 19:20:38,002 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
364 |
+
2024-08-03 19:20:38,003 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
365 |
+
2024-08-03 19:20:38,486 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
366 |
+
2024-08-03 19:20:40,487 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
367 |
+
2024-08-03 19:20:41,029 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
368 |
+
2024-08-03 19:20:41,031 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
369 |
+
2024-08-03 19:20:41,032 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
370 |
+
2024-08-03 19:20:41,033 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
371 |
+
2024-08-03 19:20:41,488 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
372 |
+
2024-08-03 19:20:42,489 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
373 |
+
2024-08-03 19:20:43,034 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
374 |
+
2024-08-03 19:20:43,967 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
375 |
+
2024-08-03 19:20:43,971 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
376 |
+
2024-08-03 19:20:43,971 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
377 |
+
2024-08-03 19:20:43,972 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
378 |
+
2024-08-03 19:20:44,490 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
379 |
+
2024-08-03 19:20:46,491 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
|
380 |
+
2024-08-03 19:20:46,492 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
381 |
+
2024-08-03 19:20:46,525 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: stop_status
|
382 |
+
2024-08-03 19:20:46,526 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: stop_status
|
383 |
+
2024-08-03 19:20:46,527 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: internal_messages
|
384 |
+
2024-08-03 19:20:46,904 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: partial_history
|
385 |
+
2024-08-03 19:20:46,905 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
386 |
+
2024-08-03 19:20:46,906 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
387 |
+
2024-08-03 19:20:46,907 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
388 |
+
2024-08-03 19:20:47,493 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
389 |
+
2024-08-03 19:20:47,496 DEBUG SenderThread:9504 [sender.py:send():382] send: exit
|
390 |
+
2024-08-03 19:20:47,496 INFO SenderThread:9504 [sender.py:send_exit():589] handling exit code: 255
|
391 |
+
2024-08-03 19:20:47,496 INFO SenderThread:9504 [sender.py:send_exit():591] handling runtime: 151
|
392 |
+
2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
393 |
+
2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:send_exit():597] send defer
|
394 |
+
2024-08-03 19:20:47,497 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
395 |
+
2024-08-03 19:20:47,497 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 0
|
396 |
+
2024-08-03 19:20:47,497 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
397 |
+
2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 0
|
398 |
+
2024-08-03 19:20:47,497 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 1
|
399 |
+
2024-08-03 19:20:47,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
400 |
+
2024-08-03 19:20:47,498 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 1
|
401 |
+
2024-08-03 19:20:47,498 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
402 |
+
2024-08-03 19:20:47,498 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 1
|
403 |
+
2024-08-03 19:20:47,498 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 2
|
404 |
+
2024-08-03 19:20:47,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
405 |
+
2024-08-03 19:20:47,498 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 2
|
406 |
+
2024-08-03 19:20:47,498 INFO HandlerThread:9504 [system_monitor.py:finish():203] Stopping system monitor
|
407 |
+
2024-08-03 19:20:47,498 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined cpu monitor
|
408 |
+
2024-08-03 19:20:47,498 DEBUG SystemMonitor:9504 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
409 |
+
2024-08-03 19:20:47,499 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined disk monitor
|
410 |
+
2024-08-03 19:20:47,499 DEBUG SystemMonitor:9504 [system_monitor.py:_start():183] Publishing last batch of metrics
|
411 |
+
2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined gpu monitor
|
412 |
+
2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined memory monitor
|
413 |
+
2024-08-03 19:20:47,532 INFO HandlerThread:9504 [interfaces.py:finish():202] Joined network monitor
|
414 |
+
2024-08-03 19:20:47,533 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
415 |
+
2024-08-03 19:20:47,533 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 2
|
416 |
+
2024-08-03 19:20:47,533 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 3
|
417 |
+
2024-08-03 19:20:47,533 DEBUG SenderThread:9504 [sender.py:send():382] send: stats
|
418 |
+
2024-08-03 19:20:47,533 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
419 |
+
2024-08-03 19:20:47,533 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 3
|
420 |
+
2024-08-03 19:20:47,536 DEBUG SenderThread:9504 [sender.py:send():382] send: history
|
421 |
+
2024-08-03 19:20:47,537 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: summary_record
|
422 |
+
2024-08-03 19:20:47,537 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
423 |
+
2024-08-03 19:20:47,538 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
424 |
+
2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 3
|
425 |
+
2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 4
|
426 |
+
2024-08-03 19:20:47,538 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
427 |
+
2024-08-03 19:20:47,538 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 4
|
428 |
+
2024-08-03 19:20:47,538 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
429 |
+
2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 4
|
430 |
+
2024-08-03 19:20:47,538 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 5
|
431 |
+
2024-08-03 19:20:47,538 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
432 |
+
2024-08-03 19:20:47,538 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 5
|
433 |
+
2024-08-03 19:20:47,539 DEBUG SenderThread:9504 [sender.py:send():382] send: summary
|
434 |
+
2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
435 |
+
2024-08-03 19:20:47,540 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
436 |
+
2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 5
|
437 |
+
2024-08-03 19:20:47,540 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 6
|
438 |
+
2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
439 |
+
2024-08-03 19:20:47,541 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 6
|
440 |
+
2024-08-03 19:20:47,541 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
441 |
+
2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 6
|
442 |
+
2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 7
|
443 |
+
2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
444 |
+
2024-08-03 19:20:47,541 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
445 |
+
2024-08-03 19:20:47,541 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 7
|
446 |
+
2024-08-03 19:20:47,541 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
447 |
+
2024-08-03 19:20:47,541 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 7
|
448 |
+
2024-08-03 19:20:47,875 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 8
|
449 |
+
2024-08-03 19:20:47,875 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
450 |
+
2024-08-03 19:20:47,875 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 8
|
451 |
+
2024-08-03 19:20:47,875 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
452 |
+
2024-08-03 19:20:47,875 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 8
|
453 |
+
2024-08-03 19:20:47,875 INFO SenderThread:9504 [job_builder.py:build():296] Attempting to build job artifact
|
454 |
+
2024-08-03 19:20:47,876 INFO SenderThread:9504 [job_builder.py:_get_source_type():426] is repo sourced job
|
455 |
+
2024-08-03 19:20:47,890 INFO SenderThread:9504 [job_builder.py:build():402] adding wandb-job metadata file
|
456 |
+
2024-08-03 19:20:47,898 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 9
|
457 |
+
2024-08-03 19:20:47,899 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
458 |
+
2024-08-03 19:20:47,899 DEBUG SenderThread:9504 [sender.py:send():382] send: artifact
|
459 |
+
2024-08-03 19:20:47,899 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 9
|
460 |
+
2024-08-03 19:20:48,494 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
461 |
+
2024-08-03 19:20:48,494 INFO Thread-12 :9504 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
462 |
+
2024-08-03 19:20:48,496 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: poll_exit
|
463 |
+
2024-08-03 19:20:49,057 INFO wandb-upload_1:9504 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmp1gtfugn3
|
464 |
+
2024-08-03 19:20:49,437 INFO wandb-upload_0:9504 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp8rydwr53
|
465 |
+
2024-08-03 19:20:50,875 INFO SenderThread:9504 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk4ODAyMA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'versionIndex': 1}}}
|
466 |
+
2024-08-03 19:20:50,876 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
467 |
+
2024-08-03 19:20:50,876 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 9
|
468 |
+
2024-08-03 19:20:50,876 INFO SenderThread:9504 [dir_watcher.py:finish():358] shutting down directory watcher
|
469 |
+
2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240803_191815-jdwps0z3/files
|
470 |
+
2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt requirements.txt
|
471 |
+
2024-08-03 19:20:51,495 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml config.yaml
|
472 |
+
2024-08-03 19:20:51,497 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-metadata.json wandb-metadata.json
|
473 |
+
2024-08-03 19:20:51,497 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json wandb-summary.json
|
474 |
+
2024-08-03 19:20:51,498 INFO SenderThread:9504 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240803_191815-jdwps0z3/files/output.log output.log
|
475 |
+
2024-08-03 19:20:51,500 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 10
|
476 |
+
2024-08-03 19:20:51,500 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: poll_exit
|
477 |
+
2024-08-03 19:20:51,502 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
478 |
+
2024-08-03 19:20:51,502 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 10
|
479 |
+
2024-08-03 19:20:51,502 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
480 |
+
2024-08-03 19:20:51,502 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 10
|
481 |
+
2024-08-03 19:20:51,502 INFO SenderThread:9504 [file_pusher.py:finish():172] shutting down file pusher
|
482 |
+
2024-08-03 19:20:51,911 INFO wandb-upload_0:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/config.yaml
|
483 |
+
2024-08-03 19:20:51,994 INFO wandb-upload_1:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/requirements.txt
|
484 |
+
2024-08-03 19:20:52,073 INFO wandb-upload_2:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/wandb-summary.json
|
485 |
+
2024-08-03 19:20:52,122 INFO wandb-upload_3:9504 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240803_191815-jdwps0z3/files/output.log
|
486 |
+
2024-08-03 19:20:52,322 INFO Thread-11 (_thread_body):9504 [sender.py:transition_state():617] send defer: 11
|
487 |
+
2024-08-03 19:20:52,322 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
488 |
+
2024-08-03 19:20:52,322 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 11
|
489 |
+
2024-08-03 19:20:52,323 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
490 |
+
2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 11
|
491 |
+
2024-08-03 19:20:52,323 INFO SenderThread:9504 [file_pusher.py:join():178] waiting for file pusher
|
492 |
+
2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 12
|
493 |
+
2024-08-03 19:20:52,323 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
494 |
+
2024-08-03 19:20:52,323 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 12
|
495 |
+
2024-08-03 19:20:52,323 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
496 |
+
2024-08-03 19:20:52,323 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 12
|
497 |
+
2024-08-03 19:20:52,323 INFO SenderThread:9504 [file_stream.py:finish():595] file stream finish called
|
498 |
+
2024-08-03 19:20:52,498 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: poll_exit
|
499 |
+
2024-08-03 19:20:52,502 INFO SenderThread:9504 [file_stream.py:finish():599] file stream finish is done
|
500 |
+
2024-08-03 19:20:52,502 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 13
|
501 |
+
2024-08-03 19:20:52,502 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: poll_exit
|
502 |
+
2024-08-03 19:20:52,502 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
503 |
+
2024-08-03 19:20:52,502 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 13
|
504 |
+
2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
505 |
+
2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 13
|
506 |
+
2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:transition_state():617] send defer: 14
|
507 |
+
2024-08-03 19:20:52,503 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: defer
|
508 |
+
2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send():382] send: final
|
509 |
+
2024-08-03 19:20:52,503 INFO HandlerThread:9504 [handler.py:handle_request_defer():172] handle defer: 14
|
510 |
+
2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send():382] send: footer
|
511 |
+
2024-08-03 19:20:52,503 DEBUG SenderThread:9504 [sender.py:send_request():409] send_request: defer
|
512 |
+
2024-08-03 19:20:52,503 INFO SenderThread:9504 [sender.py:send_request_defer():613] handle sender defer: 14
|
513 |
+
2024-08-03 19:20:56,504 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
514 |
+
2024-08-03 19:21:01,505 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
515 |
+
2024-08-03 19:21:06,505 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
516 |
+
2024-08-03 19:21:11,506 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
517 |
+
2024-08-03 19:21:16,506 DEBUG HandlerThread:9504 [handler.py:handle_request():146] handle_request: status_report
|
518 |
+
2024-08-03 19:21:21,065 WARNING StreamThr :9504 [internal.py:is_dead():414] Internal process exiting, parent pid 9433 disappeared
|
519 |
+
2024-08-03 19:21:21,065 ERROR StreamThr :9504 [internal.py:wandb_internal():152] Internal process shutdown.
|
520 |
+
2024-08-03 19:21:21,507 INFO SenderThread:9504 [sender.py:finish():1572] shutting down sender
|
521 |
+
2024-08-03 19:21:21,507 INFO SenderThread:9504 [file_pusher.py:finish():172] shutting down file pusher
|
522 |
+
2024-08-03 19:21:21,507 INFO SenderThread:9504 [file_pusher.py:join():178] waiting for file pusher
|
523 |
+
2024-08-03 19:21:21,507 INFO HandlerThread:9504 [handler.py:finish():869] shutting down handler
|
524 |
+
2024-08-03 19:21:21,507 INFO WriterThread:9504 [datastore.py:close():296] close: /project/wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb
|
wandb/run-20240803_191815-jdwps0z3/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-03 19:18:15,865 INFO MainThread:9433 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Configure stats pid to 9433
|
3 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_191815-jdwps0z3/logs/debug.log
|
9 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_191815-jdwps0z3/logs/debug-internal.log
|
10 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample2_train_2024-08-03-19:18:05', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample2', 'save': '/work/llm_recipes/models/tiny-mistral-sample2', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-03 19:18:15,866 INFO MainThread:9433 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-03 19:18:15,871 INFO MainThread:9433 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-03 19:18:15,872 INFO MainThread:9433 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-03 19:18:15,877 INFO MainThread:9433 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-03 19:18:15,887 INFO MainThread:9433 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-03 19:18:16,387 INFO MainThread:9433 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-03 19:18:16,470 INFO MainThread:9433 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-03 19:18:16,470 INFO MainThread:9433 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-03 19:18:16,525 INFO MainThread:9433 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-03 19:18:16,526 INFO MainThread:9433 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-03 19:18:17,990 INFO MainThread:9433 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
|
29 |
+
2024-08-03 19:18:17,990 INFO MainThread:9433 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240803_191815-jdwps0z3/run-jdwps0z3.wandb
ADDED
Binary file (107 kB). View file
|
|
wandb/run-20240803_192355-n3hnzq4n/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample2_train_2024-08-03-19:23:42
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample2
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample2
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 1600
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 40
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample2
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722680635.371313
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 256
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: mistral
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 512
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 4
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 4
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: MistralForCausalLM
|
wandb/run-20240803_192355-n3hnzq4n/files/output.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240803_192355-n3hnzq4n/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240803_192355-n3hnzq4n/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-03T10:23:55.999895",
|
5 |
+
"startedAt": "2024-08-03T10:23:55.358106",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"40",
|
15 |
+
"--global-batch-size",
|
16 |
+
"1600",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample2",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample2",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample2",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample2_train_2024-08-03-19:23:42"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.034,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.034,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.034,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.034,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.034,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.034,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.034,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.034,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.034,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.034,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.034,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.034,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.034,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.034,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.034,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.034,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.034,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.034,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.034,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240803_192355-n3hnzq4n/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 8.676288604736328, "training/perplexity": 5862.24904336478, "utils/batch_size": 40, "utils/global_batch_size": 1600, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 2828, "optimizer/lr": 1.933962231874466e-05, "optimizer/variance_l2": 0.014160344368509084, "optimizer/variance_sqrt_l2": 0.9983195251588314, "optimizer/momentum_l2": 0.9847836932741917, "optimizer/weight_l2": 101.93656115447489, "optimizer/variance_l1": 1.0000762939453125, "optimizer/variance_sqrt_l1": 530.25, "optimizer/momentum_l1": 418.75, "optimizer/weight_l1": 333248.0, "optimizer/variance_abs_max": 0.00130462646484375, "optimizer/variance_sqrt_abs_max": 0.0361328125, "optimizer/momentum_abs_max": 0.03662109375, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 2.9201389469999413, "stats/tokens_per_sec": 281082.5152149914, "stats/tokens_per_sec_per_gpu": 281082.5152149914, "stats/tflops": 19.898159556447013, "_timestamp": 1722688970.2608888, "_runtime": 8334.889575719833, "_step": 2828, "evaluation/val_loss": 8.68109130859375, "evaluation/val_ppl": 5890.47216796875, "_wandb": {"runtime": 8336}}
|
wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240803_192355-n3hnzq4n/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-03 19:23:55,363 INFO MainThread:10080 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Configure stats pid to 10080
|
3 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240803_192355-n3hnzq4n/logs/debug.log
|
9 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240803_192355-n3hnzq4n/logs/debug-internal.log
|
10 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample2_train_2024-08-03-19:23:42', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample2', 'save': '/work/llm_recipes/models/tiny-mistral-sample2', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 1600, 'micro_batch_size': 40, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-03 19:23:55,364 INFO MainThread:10080 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-03 19:23:55,369 INFO MainThread:10080 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-03 19:23:55,371 INFO MainThread:10080 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-03 19:23:55,375 INFO MainThread:10080 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-03 19:23:55,403 INFO MainThread:10080 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-03 19:23:55,888 INFO MainThread:10080 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-03 19:23:55,974 INFO MainThread:10080 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-03 19:23:55,974 INFO MainThread:10080 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-03 19:23:56,034 INFO MainThread:10080 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-03 19:23:56,035 INFO MainThread:10080 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-03 19:23:56,035 INFO MainThread:10080 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-03 19:23:57,141 INFO MainThread:10080 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
|
29 |
+
2024-08-03 19:23:57,141 INFO MainThread:10080 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240812_063027-j1htzx7q/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '235289369'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '235289369'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '235289369'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-sample-gemma-2-2b_train_2024-08-12-06:30:12
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 3
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 3
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: anyprecision
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-sample-gemma-2-2b
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 256000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723411827.601845
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Gemma2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: gelu_pytorch_tanh
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 2304
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: gemma2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 4096
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 8
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 26
|
wandb/run-20240812_063027-j1htzx7q/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T21:30:28.265073",
|
5 |
+
"startedAt": "2024-08-11T21:30:27.589443",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
23 |
+
"--train-data-path",
|
24 |
+
"235289369",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"235289369",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"235289369",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"anyprecision",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"3",
|
56 |
+
"--eval-interval",
|
57 |
+
"3",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-sample-gemma-2-2b",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-sample-gemma-2-2b_train_2024-08-12-06:30:12"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 06:30:27,603 INFO StreamThr :12721 [internal.py:wandb_internal():86] W&B internal server running at pid: 12721, started at: 2024-08-12 06:30:27.602612
|
2 |
+
2024-08-12 06:30:27,605 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-12 06:30:27,607 INFO WriterThread:12721 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb
|
4 |
+
2024-08-12 06:30:27,608 DEBUG SenderThread:12721 [sender.py:send():382] send: header
|
5 |
+
2024-08-12 06:30:27,640 DEBUG SenderThread:12721 [sender.py:send():382] send: run
|
6 |
+
2024-08-12 06:30:28,148 INFO SenderThread:12721 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_063027-j1htzx7q/files
|
7 |
+
2024-08-12 06:30:28,148 INFO SenderThread:12721 [sender.py:_start_run_threads():1136] run started: j1htzx7q with start time 1723411827.601845
|
8 |
+
2024-08-12 06:30:28,154 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-12 06:30:28,154 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-12 06:30:28,244 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-12 06:30:28,251 DEBUG HandlerThread:12721 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-12 06:30:28,251 DEBUG HandlerThread:12721 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-12 06:30:28,251 INFO HandlerThread:12721 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-12 06:30:28,251 INFO SystemMonitor:12721 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-12 06:30:28,251 INFO HandlerThread:12721 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-12 06:30:28,252 INFO SystemMonitor:12721 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-12 06:30:28,252 INFO SystemMonitor:12721 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-12 06:30:28,253 INFO SystemMonitor:12721 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-12 06:30:28,254 INFO SystemMonitor:12721 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-12 06:30:28,255 INFO SystemMonitor:12721 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-12 06:30:28,264 DEBUG HandlerThread:12721 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-12 06:30:28,267 DEBUG HandlerThread:12721 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-12 06:30:28,279 DEBUG HandlerThread:12721 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T21:30:28.265073', 'startedAt': '2024-08-11T21:30:27.589443', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '3', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-12-06:30:12'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
26 |
+
2024-08-12 06:30:28,279 INFO HandlerThread:12721 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-12 06:30:28,279 INFO HandlerThread:12721 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-12 06:30:28,281 INFO HandlerThread:12721 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-12 06:30:28,287 DEBUG SenderThread:12721 [sender.py:send():382] send: files
|
30 |
+
2024-08-12 06:30:28,287 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-12 06:30:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-12 06:30:28,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: python_packages
|
33 |
+
2024-08-12 06:30:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
34 |
+
2024-08-12 06:30:28,298 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-12 06:30:28,299 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-12 06:30:28,566 DEBUG SenderThread:12721 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-12 06:30:28,941 INFO wandb-upload_0:12721 [upload_job.py:push():131] Uploaded file /tmp/tmpagb8lhaywandb/h9wwuria-wandb-metadata.json
|
38 |
+
2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt
|
39 |
+
2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
40 |
+
2024-08-12 06:30:29,150 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json
|
41 |
+
2024-08-12 06:30:31,151 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
42 |
+
2024-08-12 06:30:32,987 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-12 06:30:37,988 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-12 06:30:42,988 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-12 06:30:43,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
46 |
+
2024-08-12 06:30:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
47 |
+
2024-08-12 06:30:43,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
48 |
+
2024-08-12 06:30:48,530 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
49 |
+
2024-08-12 06:30:53,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
50 |
+
2024-08-12 06:30:58,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
51 |
+
2024-08-12 06:30:58,296 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
52 |
+
2024-08-12 06:30:58,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
53 |
+
2024-08-12 06:30:58,553 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
54 |
+
2024-08-12 06:30:59,169 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
|
55 |
+
2024-08-12 06:31:03,753 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
56 |
+
2024-08-12 06:31:08,754 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
57 |
+
2024-08-12 06:31:13,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
58 |
+
2024-08-12 06:31:13,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
59 |
+
2024-08-12 06:31:13,336 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
60 |
+
2024-08-12 06:31:14,505 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
61 |
+
2024-08-12 06:31:19,506 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-12 06:31:24,507 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-12 06:31:28,255 DEBUG SystemMonitor:12721 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
64 |
+
2024-08-12 06:31:28,257 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
|
65 |
+
2024-08-12 06:31:28,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
66 |
+
2024-08-12 06:31:28,296 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
67 |
+
2024-08-12 06:31:28,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
68 |
+
2024-08-12 06:31:30,499 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
69 |
+
2024-08-12 06:31:35,500 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
70 |
+
2024-08-12 06:31:40,561 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
71 |
+
2024-08-12 06:31:41,196 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
72 |
+
2024-08-12 06:31:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
73 |
+
2024-08-12 06:31:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
74 |
+
2024-08-12 06:31:43,298 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
75 |
+
2024-08-12 06:31:46,508 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
76 |
+
2024-08-12 06:31:51,509 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
77 |
+
2024-08-12 06:31:56,510 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
78 |
+
2024-08-12 06:31:58,258 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
|
79 |
+
2024-08-12 06:31:58,296 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
80 |
+
2024-08-12 06:31:58,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
81 |
+
2024-08-12 06:31:58,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
82 |
+
2024-08-12 06:32:02,497 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
83 |
+
2024-08-12 06:32:07,497 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
84 |
+
2024-08-12 06:32:12,498 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
85 |
+
2024-08-12 06:32:13,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
86 |
+
2024-08-12 06:32:13,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
87 |
+
2024-08-12 06:32:13,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
88 |
+
2024-08-12 06:32:17,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
89 |
+
2024-08-12 06:32:22,531 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
90 |
+
2024-08-12 06:32:27,532 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
91 |
+
2024-08-12 06:32:28,259 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
|
92 |
+
2024-08-12 06:32:28,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
93 |
+
2024-08-12 06:32:28,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
94 |
+
2024-08-12 06:32:28,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
95 |
+
2024-08-12 06:32:33,512 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
96 |
+
2024-08-12 06:32:38,513 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
97 |
+
2024-08-12 06:32:43,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
98 |
+
2024-08-12 06:32:43,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
99 |
+
2024-08-12 06:32:43,340 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
100 |
+
2024-08-12 06:32:43,534 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
101 |
+
2024-08-12 06:32:48,534 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-12 06:32:53,535 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
103 |
+
2024-08-12 06:32:58,260 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
|
104 |
+
2024-08-12 06:32:58,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
105 |
+
2024-08-12 06:32:58,297 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
106 |
+
2024-08-12 06:32:58,344 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
107 |
+
2024-08-12 06:32:59,520 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
108 |
+
2024-08-12 06:33:01,246 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
109 |
+
2024-08-12 06:33:05,103 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
110 |
+
2024-08-12 06:33:10,126 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
111 |
+
2024-08-12 06:33:11,252 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
112 |
+
2024-08-12 06:33:12,515 DEBUG SenderThread:12721 [sender.py:send():382] send: config
|
113 |
+
2024-08-12 06:33:12,515 DEBUG SenderThread:12721 [sender.py:send():382] send: config
|
114 |
+
2024-08-12 06:33:13,253 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
115 |
+
2024-08-12 06:33:13,297 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: stop_status
|
116 |
+
2024-08-12 06:33:13,298 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: stop_status
|
117 |
+
2024-08-12 06:33:13,298 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
118 |
+
2024-08-12 06:33:15,255 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
119 |
+
2024-08-12 06:33:15,581 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
120 |
+
2024-08-12 06:33:15,887 DEBUG SenderThread:12721 [sender.py:send():382] send: exit
|
121 |
+
2024-08-12 06:33:15,887 INFO SenderThread:12721 [sender.py:send_exit():589] handling exit code: 1
|
122 |
+
2024-08-12 06:33:15,887 INFO SenderThread:12721 [sender.py:send_exit():591] handling runtime: 167
|
123 |
+
2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
124 |
+
2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:send_exit():597] send defer
|
125 |
+
2024-08-12 06:33:15,889 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
126 |
+
2024-08-12 06:33:15,889 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 0
|
127 |
+
2024-08-12 06:33:15,889 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
128 |
+
2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 0
|
129 |
+
2024-08-12 06:33:15,889 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 1
|
130 |
+
2024-08-12 06:33:15,890 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
131 |
+
2024-08-12 06:33:15,890 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 1
|
132 |
+
2024-08-12 06:33:15,890 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
133 |
+
2024-08-12 06:33:15,890 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 1
|
134 |
+
2024-08-12 06:33:15,890 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 2
|
135 |
+
2024-08-12 06:33:15,890 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
136 |
+
2024-08-12 06:33:15,890 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 2
|
137 |
+
2024-08-12 06:33:15,890 INFO HandlerThread:12721 [system_monitor.py:finish():203] Stopping system monitor
|
138 |
+
2024-08-12 06:33:15,890 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined cpu monitor
|
139 |
+
2024-08-12 06:33:15,890 DEBUG SystemMonitor:12721 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
140 |
+
2024-08-12 06:33:15,891 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined disk monitor
|
141 |
+
2024-08-12 06:33:15,891 DEBUG SystemMonitor:12721 [system_monitor.py:_start():183] Publishing last batch of metrics
|
142 |
+
2024-08-12 06:33:15,924 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined gpu monitor
|
143 |
+
2024-08-12 06:33:15,925 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined memory monitor
|
144 |
+
2024-08-12 06:33:15,925 INFO HandlerThread:12721 [interfaces.py:finish():202] Joined network monitor
|
145 |
+
2024-08-12 06:33:15,925 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
146 |
+
2024-08-12 06:33:15,925 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 2
|
147 |
+
2024-08-12 06:33:15,925 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 3
|
148 |
+
2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send():382] send: stats
|
149 |
+
2024-08-12 06:33:15,926 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
150 |
+
2024-08-12 06:33:15,926 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 3
|
151 |
+
2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
152 |
+
2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 3
|
153 |
+
2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 4
|
154 |
+
2024-08-12 06:33:15,926 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
155 |
+
2024-08-12 06:33:15,926 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 4
|
156 |
+
2024-08-12 06:33:15,926 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
157 |
+
2024-08-12 06:33:15,926 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 4
|
158 |
+
2024-08-12 06:33:15,927 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 5
|
159 |
+
2024-08-12 06:33:15,927 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
160 |
+
2024-08-12 06:33:15,927 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 5
|
161 |
+
2024-08-12 06:33:15,927 DEBUG SenderThread:12721 [sender.py:send():382] send: summary
|
162 |
+
2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
163 |
+
2024-08-12 06:33:15,928 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
164 |
+
2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 5
|
165 |
+
2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 6
|
166 |
+
2024-08-12 06:33:15,928 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
167 |
+
2024-08-12 06:33:15,928 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 6
|
168 |
+
2024-08-12 06:33:15,928 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
169 |
+
2024-08-12 06:33:15,928 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 6
|
170 |
+
2024-08-12 06:33:15,931 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: status_report
|
171 |
+
2024-08-12 06:33:16,132 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 7
|
172 |
+
2024-08-12 06:33:16,132 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
173 |
+
2024-08-12 06:33:16,132 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 7
|
174 |
+
2024-08-12 06:33:16,132 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
175 |
+
2024-08-12 06:33:16,132 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 7
|
176 |
+
2024-08-12 06:33:16,256 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
|
177 |
+
2024-08-12 06:33:16,256 INFO Thread-12 :12721 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json
|
178 |
+
2024-08-12 06:33:16,887 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
|
179 |
+
2024-08-12 06:33:17,100 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 8
|
180 |
+
2024-08-12 06:33:17,100 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
|
181 |
+
2024-08-12 06:33:17,100 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
182 |
+
2024-08-12 06:33:17,100 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 8
|
183 |
+
2024-08-12 06:33:17,100 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
184 |
+
2024-08-12 06:33:17,100 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 8
|
185 |
+
2024-08-12 06:33:17,100 INFO SenderThread:12721 [job_builder.py:build():296] Attempting to build job artifact
|
186 |
+
2024-08-12 06:33:17,101 INFO SenderThread:12721 [job_builder.py:_get_source_type():426] is repo sourced job
|
187 |
+
2024-08-12 06:33:17,116 INFO SenderThread:12721 [job_builder.py:build():402] adding wandb-job metadata file
|
188 |
+
2024-08-12 06:33:17,124 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 9
|
189 |
+
2024-08-12 06:33:17,125 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
190 |
+
2024-08-12 06:33:17,125 DEBUG SenderThread:12721 [sender.py:send():382] send: artifact
|
191 |
+
2024-08-12 06:33:17,125 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 9
|
192 |
+
2024-08-12 06:33:17,257 INFO Thread-12 :12721 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
193 |
+
2024-08-12 06:33:17,887 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
|
194 |
+
2024-08-12 06:33:18,153 INFO SenderThread:12721 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE0MDAxODM0Nw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
|
195 |
+
2024-08-12 06:33:18,154 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
196 |
+
2024-08-12 06:33:18,154 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 9
|
197 |
+
2024-08-12 06:33:18,154 INFO SenderThread:12721 [dir_watcher.py:finish():358] shutting down directory watcher
|
198 |
+
2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_063027-j1htzx7q/files
|
199 |
+
2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt requirements.txt
|
200 |
+
2024-08-12 06:33:18,258 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml config.yaml
|
201 |
+
2024-08-12 06:33:18,259 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-metadata.json wandb-metadata.json
|
202 |
+
2024-08-12 06:33:18,260 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json wandb-summary.json
|
203 |
+
2024-08-12 06:33:18,262 INFO SenderThread:12721 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063027-j1htzx7q/files/output.log output.log
|
204 |
+
2024-08-12 06:33:18,262 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 10
|
205 |
+
2024-08-12 06:33:18,262 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
|
206 |
+
2024-08-12 06:33:18,264 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
207 |
+
2024-08-12 06:33:18,265 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 10
|
208 |
+
2024-08-12 06:33:18,266 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
209 |
+
2024-08-12 06:33:18,266 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 10
|
210 |
+
2024-08-12 06:33:18,266 INFO SenderThread:12721 [file_pusher.py:finish():172] shutting down file pusher
|
211 |
+
2024-08-12 06:33:18,655 INFO wandb-upload_0:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/requirements.txt
|
212 |
+
2024-08-12 06:33:18,745 INFO wandb-upload_1:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/config.yaml
|
213 |
+
2024-08-12 06:33:18,843 INFO wandb-upload_2:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json
|
214 |
+
2024-08-12 06:33:18,858 INFO wandb-upload_3:12721 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063027-j1htzx7q/files/output.log
|
215 |
+
2024-08-12 06:33:18,888 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
|
216 |
+
2024-08-12 06:33:18,889 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
|
217 |
+
2024-08-12 06:33:19,058 INFO Thread-11 (_thread_body):12721 [sender.py:transition_state():617] send defer: 11
|
218 |
+
2024-08-12 06:33:19,059 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
219 |
+
2024-08-12 06:33:19,059 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 11
|
220 |
+
2024-08-12 06:33:19,059 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
221 |
+
2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 11
|
222 |
+
2024-08-12 06:33:19,059 INFO SenderThread:12721 [file_pusher.py:join():178] waiting for file pusher
|
223 |
+
2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 12
|
224 |
+
2024-08-12 06:33:19,059 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
225 |
+
2024-08-12 06:33:19,059 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 12
|
226 |
+
2024-08-12 06:33:19,059 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
227 |
+
2024-08-12 06:33:19,059 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 12
|
228 |
+
2024-08-12 06:33:19,059 INFO SenderThread:12721 [file_stream.py:finish():595] file stream finish called
|
229 |
+
2024-08-12 06:33:19,821 INFO SenderThread:12721 [file_stream.py:finish():599] file stream finish is done
|
230 |
+
2024-08-12 06:33:19,821 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 13
|
231 |
+
2024-08-12 06:33:19,822 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
232 |
+
2024-08-12 06:33:19,822 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 13
|
233 |
+
2024-08-12 06:33:19,822 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
234 |
+
2024-08-12 06:33:19,822 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 13
|
235 |
+
2024-08-12 06:33:19,822 INFO SenderThread:12721 [sender.py:transition_state():617] send defer: 14
|
236 |
+
2024-08-12 06:33:19,822 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: defer
|
237 |
+
2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send():382] send: final
|
238 |
+
2024-08-12 06:33:19,823 INFO HandlerThread:12721 [handler.py:handle_request_defer():172] handle defer: 14
|
239 |
+
2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send():382] send: footer
|
240 |
+
2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: defer
|
241 |
+
2024-08-12 06:33:19,823 INFO SenderThread:12721 [sender.py:send_request_defer():613] handle sender defer: 14
|
242 |
+
2024-08-12 06:33:19,823 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
|
243 |
+
2024-08-12 06:33:19,823 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
|
244 |
+
2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: poll_exit
|
245 |
+
2024-08-12 06:33:19,824 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: poll_exit
|
246 |
+
2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: server_info
|
247 |
+
2024-08-12 06:33:19,824 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: get_summary
|
248 |
+
2024-08-12 06:33:19,825 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: server_info
|
249 |
+
2024-08-12 06:33:19,826 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: sampled_history
|
250 |
+
2024-08-12 06:33:19,826 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: internal_messages
|
251 |
+
2024-08-12 06:33:19,827 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: job_info
|
252 |
+
2024-08-12 06:33:19,994 DEBUG SenderThread:12721 [sender.py:send_request():409] send_request: job_info
|
253 |
+
2024-08-12 06:33:19,994 INFO MainThread:12721 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
254 |
+
2024-08-12 06:33:19,994 INFO MainThread:12721 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
255 |
+
2024-08-12 06:33:19,995 INFO MainThread:12721 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
256 |
+
2024-08-12 06:33:19,995 DEBUG HandlerThread:12721 [handler.py:handle_request():146] handle_request: shutdown
|
257 |
+
2024-08-12 06:33:19,995 INFO HandlerThread:12721 [handler.py:finish():869] shutting down handler
|
258 |
+
2024-08-12 06:33:20,827 INFO WriterThread:12721 [datastore.py:close():296] close: /project/wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb
|
259 |
+
2024-08-12 06:33:20,994 INFO SenderThread:12721 [sender.py:finish():1572] shutting down sender
|
260 |
+
2024-08-12 06:33:20,995 INFO SenderThread:12721 [file_pusher.py:finish():172] shutting down file pusher
|
261 |
+
2024-08-12 06:33:20,995 INFO SenderThread:12721 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240812_063027-j1htzx7q/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Configure stats pid to 12650
|
3 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_063027-j1htzx7q/logs/debug.log
|
9 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_063027-j1htzx7q/logs/debug-internal.log
|
10 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-12 06:30:27,595 INFO MainThread:12650 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-12-06:30:12', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 3, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-12 06:30:27,596 INFO MainThread:12650 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-12 06:30:27,596 INFO MainThread:12650 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-12 06:30:27,600 INFO MainThread:12650 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-12 06:30:27,601 INFO MainThread:12650 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-12 06:30:27,606 INFO MainThread:12650 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-12 06:30:27,632 INFO MainThread:12650 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-12 06:30:28,153 INFO MainThread:12650 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-12 06:30:28,237 INFO MainThread:12650 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-12 06:30:28,237 INFO MainThread:12650 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-12 06:30:28,296 INFO MainThread:12650 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-12 06:30:28,298 INFO MainThread:12650 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-12 06:33:12,514 INFO MainThread:12650 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Gemma2ForCausalLM', 'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26}
|
29 |
+
2024-08-12 06:33:12,515 INFO MainThread:12650 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-12 06:33:20,996 WARNING MsgRouterThr:12650 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240812_063027-j1htzx7q/run-j1htzx7q.wandb
ADDED
Binary file (25 kB). View file
|
|
wandb/run-20240823_163849-faey1t8u/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1754785366'
|
38 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1754785366'
|
43 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 4096
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 2
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: Qwen2-0.5b-0.2_train_2024-08-23-16:38:35
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: llm_tutorial-0.2
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 10
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 10
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 2.0e-05
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 7500
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 1.0e-06
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 7500
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 320
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 3
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-06
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/Qwen2-0.5b-0.2
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 1
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 1
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 106
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724398729.364923
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
322 |
+
model_architecture:
|
323 |
+
desc: null
|
324 |
+
value: Qwen2ForCausalLM
|
325 |
+
activation_function:
|
326 |
+
desc: null
|
327 |
+
value: silu
|
328 |
+
hidden_size:
|
329 |
+
desc: null
|
330 |
+
value: 896
|
331 |
+
model_type:
|
332 |
+
desc: null
|
333 |
+
value: qwen2
|
334 |
+
max_position_embeddings:
|
335 |
+
desc: null
|
336 |
+
value: 4096
|
337 |
+
num_attention_heads:
|
338 |
+
desc: null
|
339 |
+
value: 14
|
340 |
+
num_hidden_layers:
|
341 |
+
desc: null
|
342 |
+
value: 24
|
wandb/run-20240823_163849-faey1t8u/files/output.log
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
5 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
6 |
+
Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
7 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
8 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
9 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
10 |
+
--> applying fsdp activation checkpointing...
|
11 |
+
> datasets target sizes (minimum size):
|
12 |
+
train: 2400000
|
13 |
+
validation: 2403200
|
14 |
+
test: 3200
|
15 |
+
> building train, validation, and test datasets for GPT ...
|
16 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
17 |
+
warnings.warn(
|
18 |
+
Let split = None
|
19 |
+
Unable to save the indexes because path_to_cache is None
|
20 |
+
Building a BlendedDataset for a single MegatronDataset
|
21 |
+
Unable to save the indexes because path_to_cache is None
|
22 |
+
Building a BlendedDataset for a single MegatronDataset
|
23 |
+
Unable to save the indexes because path_to_cache is None
|
24 |
+
> finished creating GPT datasets ...
|
25 |
+
Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
26 |
+
[rank0]:[2024-08-23 16:38:58,062] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
27 |
+
Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
28 |
+
model info: FullyShardedDataParallel(
|
29 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
30 |
+
(model): Qwen2Model(
|
31 |
+
(embed_tokens): Embedding(151936, 896)
|
32 |
+
(layers): ModuleList(
|
33 |
+
(0-23): 24 x FullyShardedDataParallel(
|
34 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
35 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
36 |
+
(self_attn): Qwen2FlashAttention2(
|
37 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
38 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
39 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
40 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
41 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
42 |
+
)
|
43 |
+
(mlp): Qwen2MLP(
|
44 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
45 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
46 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
47 |
+
(act_fn): SiLU()
|
48 |
+
)
|
49 |
+
(input_layernorm): Qwen2RMSNorm()
|
50 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
)
|
55 |
+
(norm): Qwen2RMSNorm()
|
56 |
+
)
|
57 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
model config: Qwen2Config {
|
61 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
62 |
+
"architectures": [
|
63 |
+
"Qwen2ForCausalLM"
|
64 |
+
],
|
65 |
+
"attention_dropout": 0.0,
|
66 |
+
"bos_token_id": 151643,
|
67 |
+
"eos_token_id": 151643,
|
68 |
+
"hidden_act": "silu",
|
69 |
+
"hidden_size": 896,
|
70 |
+
"initializer_range": 0.02,
|
71 |
+
"intermediate_size": 4864,
|
72 |
+
"label_smoothing": 0.0,
|
73 |
+
"max_position_embeddings": 4096,
|
74 |
+
"max_window_layers": 24,
|
75 |
+
"model_type": "qwen2",
|
76 |
+
"num_attention_heads": 14,
|
77 |
+
"num_hidden_layers": 24,
|
78 |
+
"num_key_value_heads": 2,
|
79 |
+
"rms_norm_eps": 1e-06,
|
80 |
+
"rope_theta": 1000000.0,
|
81 |
+
"sliding_window": 131072,
|
82 |
+
"tie_word_embeddings": true,
|
83 |
+
"torch_dtype": "bfloat16",
|
84 |
+
"transformers_version": "4.43.3",
|
85 |
+
"use_cache": false,
|
86 |
+
"use_sliding_window": false,
|
87 |
+
"vocab_size": 151936
|
88 |
+
}
|
89 |
+
------------------------------------------------------------------
|
90 |
+
iteration: 41 , TFLOPS: 89.20992749379542, Tokens per sec: 22186.06078570857, Loss: 4.376823425292969
|
91 |
+
------------------------------------------------------------------
|
92 |
+
------------------------------------------------------------------
|
93 |
+
iteration: 42 , TFLOPS: 90.73548605717187, Tokens per sec: 22565.459536162463, Loss: 4.388589382171631
|
94 |
+
------------------------------------------------------------------
|
95 |
+
------------------------------------------------------------------
|
96 |
+
iteration: 43 , TFLOPS: 90.773373714071, Tokens per sec: 22574.882006089374, Loss: 4.334207057952881
|
97 |
+
------------------------------------------------------------------
|
98 |
+
------------------------------------------------------------------
|
99 |
+
iteration: 44 , TFLOPS: 90.78534040748795, Tokens per sec: 22577.85806262267, Loss: 4.347831726074219
|
100 |
+
------------------------------------------------------------------
|
101 |
+
------------------------------------------------------------------
|
102 |
+
iteration: 45 , TFLOPS: 90.91842135677658, Tokens per sec: 22610.954626125003, Loss: 4.369765281677246
|
103 |
+
------------------------------------------------------------------
|
104 |
+
------------------------------------------------------------------
|
105 |
+
iteration: 46 , TFLOPS: 90.73681564901436, Tokens per sec: 22565.79019897395, Loss: 4.371013164520264
|
106 |
+
------------------------------------------------------------------
|
107 |
+
------------------------------------------------------------------
|
108 |
+
iteration: 47 , TFLOPS: 90.85118295674725, Tokens per sec: 22594.23277383689, Loss: 4.347028732299805
|
109 |
+
------------------------------------------------------------------
|
110 |
+
------------------------------------------------------------------
|
111 |
+
iteration: 48 , TFLOPS: 90.81419242163798, Tokens per sec: 22585.033413592068, Loss: 4.319859504699707
|
112 |
+
------------------------------------------------------------------
|
113 |
+
Traceback (most recent call last):
|
114 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
115 |
+
main()
|
116 |
+
File "/project/src/llama_recipes/finetuning.py", line 282, in main
|
117 |
+
train(
|
118 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
|
119 |
+
loss.backward()
|
120 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
|
121 |
+
torch.autograd.backward(
|
122 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
|
123 |
+
_engine_run_backward(
|
124 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
|
125 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
126 |
+
KeyboardInterrupt
|
wandb/run-20240823_163849-faey1t8u/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-23T07:38:49.894670",
|
5 |
+
"startedAt": "2024-08-23T07:38:49.352718",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"3",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"1",
|
17 |
+
"--global-batch-size",
|
18 |
+
"320",
|
19 |
+
"--train-iters",
|
20 |
+
"7500",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1754785366",
|
32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1754785366",
|
35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"2e-5",
|
38 |
+
"--min-lr",
|
39 |
+
"1e-6",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"7500",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-6",
|
58 |
+
"--save-interval",
|
59 |
+
"10",
|
60 |
+
"--eval-interval",
|
61 |
+
"10",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
72 |
+
"--fsdp-activation-checkpointing",
|
73 |
+
"--sharding-strategy",
|
74 |
+
"FULL_SHARD",
|
75 |
+
"--checkpoint-type",
|
76 |
+
"LOCAL_STATE_DICT",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--upload-all-checkpoints-to-hf",
|
80 |
+
"--hf-upload-retry-limit",
|
81 |
+
"2",
|
82 |
+
"--hf-repo-id",
|
83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
84 |
+
"--wandb-entity",
|
85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
86 |
+
"--wandb-project",
|
87 |
+
"llm_tutorial-0.2",
|
88 |
+
"--wandb-name",
|
89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-16:38:35"
|
90 |
+
],
|
91 |
+
"state": "running",
|
92 |
+
"program": "/project/examples/finetuning.py",
|
93 |
+
"codePathLocal": "examples/finetuning.py",
|
94 |
+
"codePath": "examples/finetuning.py",
|
95 |
+
"git": {
|
96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
98 |
+
},
|
99 |
+
"email": null,
|
100 |
+
"root": "/project",
|
101 |
+
"host": "gpu-koiwa-00",
|
102 |
+
"username": "koiwa",
|
103 |
+
"executable": "/usr/bin/python",
|
104 |
+
"cpu_count": 18,
|
105 |
+
"cpu_count_logical": 18,
|
106 |
+
"cpu_freq": {
|
107 |
+
"current": 2400.0389999999993,
|
108 |
+
"min": 0.0,
|
109 |
+
"max": 0.0
|
110 |
+
},
|
111 |
+
"cpu_freq_per_core": [
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2400.039,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"disk": {
|
204 |
+
"/": {
|
205 |
+
"total": 0.0625,
|
206 |
+
"used": 1.1444091796875e-05
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
210 |
+
"gpu_count": 1,
|
211 |
+
"gpu_devices": [
|
212 |
+
{
|
213 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
214 |
+
"memory_total": 42949672960
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"memory": {
|
218 |
+
"total": 56.487831115722656
|
219 |
+
}
|
220 |
+
}
|
wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 4.319859504699707, "training/perplexity": 75.17806538514934, "utils/batch_size": 3, "utils/global_batch_size": 318, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 106, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.050272112510912605, "optimizer/variance_sqrt_l2": 0.9579955556165142, "optimizer/momentum_l2": 0.9571913293356521, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.9176788330078125, "optimizer/variance_sqrt_l1": 4099.5, "optimizer/momentum_l1": 3579.0, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.0419921875, "optimizer/variance_sqrt_abs_max": 0.205078125, "optimizer/momentum_abs_max": 0.2236328125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 57.68625514700034, "stats/tokens_per_sec": 22585.033413592068, "stats/tokens_per_sec_per_gpu": 22585.033413592068, "stats/tflops": 90.81419242163798, "_timestamp": 1724399202.2699971, "_runtime": 472.90507411956787, "_step": 48, "_wandb": {"runtime": 476}}
|
wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log
ADDED
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 16:38:49,367 INFO StreamThr :12305 [internal.py:wandb_internal():86] W&B internal server running at pid: 12305, started at: 2024-08-23 16:38:49.366185
|
2 |
+
2024-08-23 16:38:49,368 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-23 16:38:49,370 INFO WriterThread:12305 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb
|
4 |
+
2024-08-23 16:38:49,371 DEBUG SenderThread:12305 [sender.py:send():382] send: header
|
5 |
+
2024-08-23 16:38:49,405 DEBUG SenderThread:12305 [sender.py:send():382] send: run
|
6 |
+
2024-08-23 16:38:49,800 INFO SenderThread:12305 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_163849-faey1t8u/files
|
7 |
+
2024-08-23 16:38:49,800 INFO SenderThread:12305 [sender.py:_start_run_threads():1136] run started: faey1t8u with start time 1724398729.364923
|
8 |
+
2024-08-23 16:38:49,806 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-23 16:38:49,806 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-23 16:38:49,876 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-23 16:38:49,882 DEBUG HandlerThread:12305 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-23 16:38:49,882 DEBUG HandlerThread:12305 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-23 16:38:49,882 INFO HandlerThread:12305 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-23 16:38:49,882 INFO SystemMonitor:12305 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-23 16:38:49,882 INFO HandlerThread:12305 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-23 16:38:49,883 INFO SystemMonitor:12305 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-23 16:38:49,883 INFO SystemMonitor:12305 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-23 16:38:49,884 INFO SystemMonitor:12305 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-23 16:38:49,885 INFO SystemMonitor:12305 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-23 16:38:49,886 INFO SystemMonitor:12305 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-23 16:38:49,894 DEBUG HandlerThread:12305 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-23 16:38:49,896 DEBUG HandlerThread:12305 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-23 16:38:49,908 DEBUG HandlerThread:12305 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-23 16:38:49,909 DEBUG HandlerThread:12305 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-23 16:38:49,909 DEBUG HandlerThread:12305 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:38:49.894670', 'startedAt': '2024-08-23T07:38:49.352718', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '131072', '--micro-batch-size', '3', '--valid_micro_batch_size', '1', '--global-batch-size', '320', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:38:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
|
26 |
+
2024-08-23 16:38:49,909 INFO HandlerThread:12305 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-23 16:38:49,909 INFO HandlerThread:12305 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-23 16:38:49,910 INFO HandlerThread:12305 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-23 16:38:49,915 DEBUG SenderThread:12305 [sender.py:send():382] send: files
|
30 |
+
2024-08-23 16:38:49,916 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-23 16:38:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-23 16:38:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
33 |
+
2024-08-23 16:38:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
34 |
+
2024-08-23 16:38:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-23 16:38:49,930 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-23 16:38:50,183 DEBUG SenderThread:12305 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-23 16:38:50,520 INFO wandb-upload_0:12305 [upload_job.py:push():131] Uploaded file /tmp/tmpljn_2vd6wandb/tfd8n6zw-wandb-metadata.json
|
38 |
+
2024-08-23 16:38:50,802 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json
|
39 |
+
2024-08-23 16:38:50,802 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt
|
40 |
+
2024-08-23 16:38:50,803 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
41 |
+
2024-08-23 16:38:52,803 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
42 |
+
2024-08-23 16:38:54,804 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
43 |
+
2024-08-23 16:38:54,958 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-23 16:38:56,805 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
45 |
+
2024-08-23 16:38:58,352 DEBUG SenderThread:12305 [sender.py:send():382] send: config
|
46 |
+
2024-08-23 16:38:58,353 DEBUG SenderThread:12305 [sender.py:send():382] send: config
|
47 |
+
2024-08-23 16:38:58,807 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
48 |
+
2024-08-23 16:39:00,353 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
49 |
+
2024-08-23 16:39:00,808 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
50 |
+
2024-08-23 16:39:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
51 |
+
2024-08-23 16:39:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
52 |
+
2024-08-23 16:39:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
53 |
+
2024-08-23 16:39:06,113 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
54 |
+
2024-08-23 16:39:11,113 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-23 16:39:16,114 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
56 |
+
2024-08-23 16:39:19,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
57 |
+
2024-08-23 16:39:19,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
58 |
+
2024-08-23 16:39:19,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
59 |
+
2024-08-23 16:39:21,162 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
60 |
+
2024-08-23 16:39:21,819 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml
|
61 |
+
2024-08-23 16:39:26,345 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-23 16:39:31,346 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-23 16:39:34,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
64 |
+
2024-08-23 16:39:34,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
65 |
+
2024-08-23 16:39:34,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
66 |
+
2024-08-23 16:39:37,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
67 |
+
2024-08-23 16:39:42,126 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
68 |
+
2024-08-23 16:39:47,127 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
69 |
+
2024-08-23 16:39:49,886 DEBUG SystemMonitor:12305 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
70 |
+
2024-08-23 16:39:49,888 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
71 |
+
2024-08-23 16:39:49,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
72 |
+
2024-08-23 16:39:49,926 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
73 |
+
2024-08-23 16:39:49,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
74 |
+
2024-08-23 16:39:53,111 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
75 |
+
2024-08-23 16:39:58,112 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
76 |
+
2024-08-23 16:39:58,398 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
77 |
+
2024-08-23 16:40:00,838 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
78 |
+
2024-08-23 16:40:03,440 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
79 |
+
2024-08-23 16:40:04,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
80 |
+
2024-08-23 16:40:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
81 |
+
2024-08-23 16:40:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
82 |
+
2024-08-23 16:40:09,115 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
83 |
+
2024-08-23 16:40:14,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
84 |
+
2024-08-23 16:40:19,117 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
85 |
+
2024-08-23 16:40:19,889 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
86 |
+
2024-08-23 16:40:19,926 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
87 |
+
2024-08-23 16:40:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
88 |
+
2024-08-23 16:40:19,967 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
89 |
+
2024-08-23 16:40:25,115 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
90 |
+
2024-08-23 16:40:30,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
91 |
+
2024-08-23 16:40:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
92 |
+
2024-08-23 16:40:34,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
93 |
+
2024-08-23 16:40:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
94 |
+
2024-08-23 16:40:35,173 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-08-23 16:40:40,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
96 |
+
2024-08-23 16:40:45,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
97 |
+
2024-08-23 16:40:49,890 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
98 |
+
2024-08-23 16:40:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
99 |
+
2024-08-23 16:40:49,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
100 |
+
2024-08-23 16:40:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
101 |
+
2024-08-23 16:40:51,147 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-23 16:40:56,136 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
103 |
+
2024-08-23 16:40:56,139 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
104 |
+
2024-08-23 16:40:56,139 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
105 |
+
2024-08-23 16:40:56,154 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
106 |
+
2024-08-23 16:40:56,154 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
107 |
+
2024-08-23 16:40:56,867 INFO Thread-12 :12305 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
108 |
+
2024-08-23 16:40:58,868 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
109 |
+
2024-08-23 16:41:01,180 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
110 |
+
2024-08-23 16:41:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
111 |
+
2024-08-23 16:41:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
112 |
+
2024-08-23 16:41:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
113 |
+
2024-08-23 16:41:07,173 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
114 |
+
2024-08-23 16:41:12,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
115 |
+
2024-08-23 16:41:17,174 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
116 |
+
2024-08-23 16:41:19,891 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
117 |
+
2024-08-23 16:41:19,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
118 |
+
2024-08-23 16:41:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
119 |
+
2024-08-23 16:41:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
120 |
+
2024-08-23 16:41:23,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
121 |
+
2024-08-23 16:41:28,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
122 |
+
2024-08-23 16:41:33,139 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
123 |
+
2024-08-23 16:41:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
124 |
+
2024-08-23 16:41:34,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
125 |
+
2024-08-23 16:41:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
126 |
+
2024-08-23 16:41:39,118 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
127 |
+
2024-08-23 16:41:44,119 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
128 |
+
2024-08-23 16:41:49,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
129 |
+
2024-08-23 16:41:49,892 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
130 |
+
2024-08-23 16:41:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
131 |
+
2024-08-23 16:41:49,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
132 |
+
2024-08-23 16:41:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
133 |
+
2024-08-23 16:41:53,851 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
134 |
+
2024-08-23 16:41:53,852 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
135 |
+
2024-08-23 16:41:53,852 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
136 |
+
2024-08-23 16:41:53,854 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
137 |
+
2024-08-23 16:41:53,896 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
138 |
+
2024-08-23 16:41:54,892 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
139 |
+
2024-08-23 16:41:54,897 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
140 |
+
2024-08-23 16:41:59,892 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
141 |
+
2024-08-23 16:42:04,893 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
142 |
+
2024-08-23 16:42:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
143 |
+
2024-08-23 16:42:04,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
144 |
+
2024-08-23 16:42:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
145 |
+
2024-08-23 16:42:10,105 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
146 |
+
2024-08-23 16:42:15,106 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
147 |
+
2024-08-23 16:42:19,893 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
148 |
+
2024-08-23 16:42:19,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
149 |
+
2024-08-23 16:42:19,927 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
150 |
+
2024-08-23 16:42:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
151 |
+
2024-08-23 16:42:20,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
152 |
+
2024-08-23 16:42:25,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
153 |
+
2024-08-23 16:42:30,147 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
154 |
+
2024-08-23 16:42:34,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
155 |
+
2024-08-23 16:42:34,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
156 |
+
2024-08-23 16:42:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
157 |
+
2024-08-23 16:42:36,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
158 |
+
2024-08-23 16:42:41,138 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
159 |
+
2024-08-23 16:42:46,139 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
160 |
+
2024-08-23 16:42:49,894 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
161 |
+
2024-08-23 16:42:49,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
162 |
+
2024-08-23 16:42:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
163 |
+
2024-08-23 16:42:49,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
164 |
+
2024-08-23 16:42:51,557 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
165 |
+
2024-08-23 16:42:51,559 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
166 |
+
2024-08-23 16:42:51,559 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
167 |
+
2024-08-23 16:42:51,560 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
168 |
+
2024-08-23 16:42:51,561 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
169 |
+
2024-08-23 16:42:51,926 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
170 |
+
2024-08-23 16:42:52,927 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
171 |
+
2024-08-23 16:42:56,600 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
172 |
+
2024-08-23 16:43:01,601 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
173 |
+
2024-08-23 16:43:04,927 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
174 |
+
2024-08-23 16:43:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
175 |
+
2024-08-23 16:43:04,930 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
176 |
+
2024-08-23 16:43:07,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
177 |
+
2024-08-23 16:43:12,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
178 |
+
2024-08-23 16:43:17,132 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
179 |
+
2024-08-23 16:43:19,895 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
180 |
+
2024-08-23 16:43:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
181 |
+
2024-08-23 16:43:19,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
182 |
+
2024-08-23 16:43:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
183 |
+
2024-08-23 16:43:23,107 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
184 |
+
2024-08-23 16:43:28,108 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
185 |
+
2024-08-23 16:43:33,108 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
186 |
+
2024-08-23 16:43:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
187 |
+
2024-08-23 16:43:34,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
188 |
+
2024-08-23 16:43:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
189 |
+
2024-08-23 16:43:38,202 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
190 |
+
2024-08-23 16:43:43,203 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
191 |
+
2024-08-23 16:43:48,203 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
192 |
+
2024-08-23 16:43:49,180 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
193 |
+
2024-08-23 16:43:49,181 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
194 |
+
2024-08-23 16:43:49,182 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
195 |
+
2024-08-23 16:43:49,184 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
196 |
+
2024-08-23 16:43:49,896 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
197 |
+
2024-08-23 16:43:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
198 |
+
2024-08-23 16:43:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
199 |
+
2024-08-23 16:43:49,930 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
200 |
+
2024-08-23 16:43:49,954 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
201 |
+
2024-08-23 16:43:50,955 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
202 |
+
2024-08-23 16:43:54,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
203 |
+
2024-08-23 16:43:59,116 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
204 |
+
2024-08-23 16:44:04,117 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
205 |
+
2024-08-23 16:44:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
206 |
+
2024-08-23 16:44:04,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
207 |
+
2024-08-23 16:44:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
208 |
+
2024-08-23 16:44:09,123 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
209 |
+
2024-08-23 16:44:14,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
210 |
+
2024-08-23 16:44:19,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
211 |
+
2024-08-23 16:44:19,897 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
212 |
+
2024-08-23 16:44:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
213 |
+
2024-08-23 16:44:19,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
214 |
+
2024-08-23 16:44:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
215 |
+
2024-08-23 16:44:25,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
216 |
+
2024-08-23 16:44:30,120 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
217 |
+
2024-08-23 16:44:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
218 |
+
2024-08-23 16:44:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
219 |
+
2024-08-23 16:44:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
220 |
+
2024-08-23 16:44:35,144 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
221 |
+
2024-08-23 16:44:40,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
222 |
+
2024-08-23 16:44:45,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
223 |
+
2024-08-23 16:44:46,917 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
224 |
+
2024-08-23 16:44:46,920 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
225 |
+
2024-08-23 16:44:46,921 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
226 |
+
2024-08-23 16:44:46,922 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
227 |
+
2024-08-23 16:44:46,983 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
228 |
+
2024-08-23 16:44:48,984 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
229 |
+
2024-08-23 16:44:49,898 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
230 |
+
2024-08-23 16:44:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
231 |
+
2024-08-23 16:44:49,928 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
232 |
+
2024-08-23 16:44:49,931 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
233 |
+
2024-08-23 16:44:50,159 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
234 |
+
2024-08-23 16:44:55,160 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
235 |
+
2024-08-23 16:45:00,160 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
236 |
+
2024-08-23 16:45:04,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
237 |
+
2024-08-23 16:45:04,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
238 |
+
2024-08-23 16:45:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
239 |
+
2024-08-23 16:45:06,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
240 |
+
2024-08-23 16:45:11,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
241 |
+
2024-08-23 16:45:16,142 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
242 |
+
2024-08-23 16:45:19,899 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
243 |
+
2024-08-23 16:45:19,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
244 |
+
2024-08-23 16:45:19,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
245 |
+
2024-08-23 16:45:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
246 |
+
2024-08-23 16:45:21,199 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
247 |
+
2024-08-23 16:45:26,199 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
248 |
+
2024-08-23 16:45:31,200 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
249 |
+
2024-08-23 16:45:34,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
250 |
+
2024-08-23 16:45:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
251 |
+
2024-08-23 16:45:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
252 |
+
2024-08-23 16:45:37,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
253 |
+
2024-08-23 16:45:42,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
254 |
+
2024-08-23 16:45:44,582 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
255 |
+
2024-08-23 16:45:44,584 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
256 |
+
2024-08-23 16:45:44,584 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
257 |
+
2024-08-23 16:45:44,585 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
258 |
+
2024-08-23 16:45:45,011 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
259 |
+
2024-08-23 16:45:47,011 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
260 |
+
2024-08-23 16:45:47,624 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
261 |
+
2024-08-23 16:45:49,899 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
262 |
+
2024-08-23 16:45:49,928 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
263 |
+
2024-08-23 16:45:49,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
264 |
+
2024-08-23 16:45:49,931 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
265 |
+
2024-08-23 16:45:53,153 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
266 |
+
2024-08-23 16:45:58,153 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
267 |
+
2024-08-23 16:46:03,154 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
268 |
+
2024-08-23 16:46:04,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
269 |
+
2024-08-23 16:46:04,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
270 |
+
2024-08-23 16:46:04,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
271 |
+
2024-08-23 16:46:09,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
272 |
+
2024-08-23 16:46:14,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
273 |
+
2024-08-23 16:46:19,101 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
274 |
+
2024-08-23 16:46:19,900 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
275 |
+
2024-08-23 16:46:19,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
276 |
+
2024-08-23 16:46:19,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
277 |
+
2024-08-23 16:46:19,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
278 |
+
2024-08-23 16:46:24,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
279 |
+
2024-08-23 16:46:29,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
280 |
+
2024-08-23 16:46:34,146 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
281 |
+
2024-08-23 16:46:34,929 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: stop_status
|
282 |
+
2024-08-23 16:46:34,929 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: stop_status
|
283 |
+
2024-08-23 16:46:34,971 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
284 |
+
2024-08-23 16:46:40,145 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
285 |
+
2024-08-23 16:46:42,271 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: partial_history
|
286 |
+
2024-08-23 16:46:42,273 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
287 |
+
2024-08-23 16:46:42,273 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
288 |
+
2024-08-23 16:46:42,274 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
289 |
+
2024-08-23 16:46:43,038 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
290 |
+
2024-08-23 16:46:45,038 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
291 |
+
2024-08-23 16:46:45,316 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
292 |
+
2024-08-23 16:46:46,081 DEBUG SenderThread:12305 [sender.py:send():382] send: exit
|
293 |
+
2024-08-23 16:46:46,081 INFO SenderThread:12305 [sender.py:send_exit():589] handling exit code: 255
|
294 |
+
2024-08-23 16:46:46,081 INFO SenderThread:12305 [sender.py:send_exit():591] handling runtime: 476
|
295 |
+
2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
296 |
+
2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:send_exit():597] send defer
|
297 |
+
2024-08-23 16:46:46,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
298 |
+
2024-08-23 16:46:46,083 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 0
|
299 |
+
2024-08-23 16:46:46,083 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
300 |
+
2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 0
|
301 |
+
2024-08-23 16:46:46,083 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 1
|
302 |
+
2024-08-23 16:46:46,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
303 |
+
2024-08-23 16:46:46,083 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 1
|
304 |
+
2024-08-23 16:46:46,084 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
305 |
+
2024-08-23 16:46:46,084 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 1
|
306 |
+
2024-08-23 16:46:46,084 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 2
|
307 |
+
2024-08-23 16:46:46,084 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
308 |
+
2024-08-23 16:46:46,084 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 2
|
309 |
+
2024-08-23 16:46:46,084 INFO HandlerThread:12305 [system_monitor.py:finish():203] Stopping system monitor
|
310 |
+
2024-08-23 16:46:46,084 DEBUG SystemMonitor:12305 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
311 |
+
2024-08-23 16:46:46,084 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined cpu monitor
|
312 |
+
2024-08-23 16:46:46,084 DEBUG SystemMonitor:12305 [system_monitor.py:_start():183] Publishing last batch of metrics
|
313 |
+
2024-08-23 16:46:46,084 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined disk monitor
|
314 |
+
2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined gpu monitor
|
315 |
+
2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined memory monitor
|
316 |
+
2024-08-23 16:46:46,118 INFO HandlerThread:12305 [interfaces.py:finish():202] Joined network monitor
|
317 |
+
2024-08-23 16:46:46,118 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
318 |
+
2024-08-23 16:46:46,119 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 2
|
319 |
+
2024-08-23 16:46:46,119 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 3
|
320 |
+
2024-08-23 16:46:46,119 DEBUG SenderThread:12305 [sender.py:send():382] send: stats
|
321 |
+
2024-08-23 16:46:46,119 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
322 |
+
2024-08-23 16:46:46,119 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 3
|
323 |
+
2024-08-23 16:46:46,120 DEBUG SenderThread:12305 [sender.py:send():382] send: history
|
324 |
+
2024-08-23 16:46:46,121 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: summary_record
|
325 |
+
2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
326 |
+
2024-08-23 16:46:46,122 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
327 |
+
2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 3
|
328 |
+
2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 4
|
329 |
+
2024-08-23 16:46:46,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
330 |
+
2024-08-23 16:46:46,122 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 4
|
331 |
+
2024-08-23 16:46:46,122 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
332 |
+
2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 4
|
333 |
+
2024-08-23 16:46:46,122 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 5
|
334 |
+
2024-08-23 16:46:46,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
335 |
+
2024-08-23 16:46:46,122 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 5
|
336 |
+
2024-08-23 16:46:46,123 DEBUG SenderThread:12305 [sender.py:send():382] send: summary
|
337 |
+
2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
338 |
+
2024-08-23 16:46:46,124 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
339 |
+
2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 5
|
340 |
+
2024-08-23 16:46:46,124 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 6
|
341 |
+
2024-08-23 16:46:46,124 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
342 |
+
2024-08-23 16:46:46,124 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 6
|
343 |
+
2024-08-23 16:46:46,125 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
344 |
+
2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 6
|
345 |
+
2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 7
|
346 |
+
2024-08-23 16:46:46,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: status_report
|
347 |
+
2024-08-23 16:46:46,125 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
348 |
+
2024-08-23 16:46:46,125 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 7
|
349 |
+
2024-08-23 16:46:46,125 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
350 |
+
2024-08-23 16:46:46,125 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 7
|
351 |
+
2024-08-23 16:46:47,040 INFO Thread-12 :12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
352 |
+
2024-08-23 16:46:47,081 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
353 |
+
2024-08-23 16:46:47,096 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 8
|
354 |
+
2024-08-23 16:46:47,096 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
355 |
+
2024-08-23 16:46:47,096 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
356 |
+
2024-08-23 16:46:47,097 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 8
|
357 |
+
2024-08-23 16:46:47,097 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
358 |
+
2024-08-23 16:46:47,097 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 8
|
359 |
+
2024-08-23 16:46:47,097 INFO SenderThread:12305 [job_builder.py:build():296] Attempting to build job artifact
|
360 |
+
2024-08-23 16:46:47,098 INFO SenderThread:12305 [job_builder.py:_get_source_type():426] is repo sourced job
|
361 |
+
2024-08-23 16:46:47,113 INFO SenderThread:12305 [job_builder.py:build():402] adding wandb-job metadata file
|
362 |
+
2024-08-23 16:46:47,121 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 9
|
363 |
+
2024-08-23 16:46:47,122 DEBUG SenderThread:12305 [sender.py:send():382] send: artifact
|
364 |
+
2024-08-23 16:46:47,122 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
365 |
+
2024-08-23 16:46:47,123 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 9
|
366 |
+
2024-08-23 16:46:48,001 INFO SenderThread:12305 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'versionIndex': 3}}}
|
367 |
+
2024-08-23 16:46:48,001 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
368 |
+
2024-08-23 16:46:48,001 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 9
|
369 |
+
2024-08-23 16:46:48,001 INFO SenderThread:12305 [dir_watcher.py:finish():358] shutting down directory watcher
|
370 |
+
2024-08-23 16:46:48,041 INFO SenderThread:12305 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
371 |
+
2024-08-23 16:46:48,041 INFO SenderThread:12305 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_163849-faey1t8u/files
|
372 |
+
2024-08-23 16:46:48,042 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt requirements.txt
|
373 |
+
2024-08-23 16:46:48,042 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml config.yaml
|
374 |
+
2024-08-23 16:46:48,043 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-metadata.json wandb-metadata.json
|
375 |
+
2024-08-23 16:46:48,043 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json wandb-summary.json
|
376 |
+
2024-08-23 16:46:48,045 INFO SenderThread:12305 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_163849-faey1t8u/files/output.log output.log
|
377 |
+
2024-08-23 16:46:48,046 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 10
|
378 |
+
2024-08-23 16:46:48,047 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
379 |
+
2024-08-23 16:46:48,048 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 10
|
380 |
+
2024-08-23 16:46:48,048 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
381 |
+
2024-08-23 16:46:48,048 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 10
|
382 |
+
2024-08-23 16:46:48,048 INFO SenderThread:12305 [file_pusher.py:finish():172] shutting down file pusher
|
383 |
+
2024-08-23 16:46:48,082 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
384 |
+
2024-08-23 16:46:48,082 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
385 |
+
2024-08-23 16:46:49,082 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
386 |
+
2024-08-23 16:46:49,082 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
387 |
+
2024-08-23 16:46:50,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
388 |
+
2024-08-23 16:46:50,083 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
389 |
+
2024-08-23 16:46:50,119 INFO wandb-upload_2:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/wandb-summary.json
|
390 |
+
2024-08-23 16:46:50,148 INFO wandb-upload_3:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/output.log
|
391 |
+
2024-08-23 16:46:50,166 INFO wandb-upload_0:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/requirements.txt
|
392 |
+
2024-08-23 16:46:50,204 INFO wandb-upload_1:12305 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_163849-faey1t8u/files/config.yaml
|
393 |
+
2024-08-23 16:46:50,405 INFO Thread-11 (_thread_body):12305 [sender.py:transition_state():617] send defer: 11
|
394 |
+
2024-08-23 16:46:50,405 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
395 |
+
2024-08-23 16:46:50,405 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 11
|
396 |
+
2024-08-23 16:46:50,405 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
397 |
+
2024-08-23 16:46:50,405 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 11
|
398 |
+
2024-08-23 16:46:50,405 INFO SenderThread:12305 [file_pusher.py:join():178] waiting for file pusher
|
399 |
+
2024-08-23 16:46:50,405 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 12
|
400 |
+
2024-08-23 16:46:50,405 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
401 |
+
2024-08-23 16:46:50,406 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 12
|
402 |
+
2024-08-23 16:46:50,406 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
403 |
+
2024-08-23 16:46:50,406 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 12
|
404 |
+
2024-08-23 16:46:50,406 INFO SenderThread:12305 [file_stream.py:finish():595] file stream finish called
|
405 |
+
2024-08-23 16:46:51,083 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
406 |
+
2024-08-23 16:46:51,097 INFO SenderThread:12305 [file_stream.py:finish():599] file stream finish is done
|
407 |
+
2024-08-23 16:46:51,097 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 13
|
408 |
+
2024-08-23 16:46:51,097 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
409 |
+
2024-08-23 16:46:51,097 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
410 |
+
2024-08-23 16:46:51,098 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 13
|
411 |
+
2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
412 |
+
2024-08-23 16:46:51,098 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 13
|
413 |
+
2024-08-23 16:46:51,098 INFO SenderThread:12305 [sender.py:transition_state():617] send defer: 14
|
414 |
+
2024-08-23 16:46:51,098 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: defer
|
415 |
+
2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send():382] send: final
|
416 |
+
2024-08-23 16:46:51,098 INFO HandlerThread:12305 [handler.py:handle_request_defer():172] handle defer: 14
|
417 |
+
2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send():382] send: footer
|
418 |
+
2024-08-23 16:46:51,098 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: defer
|
419 |
+
2024-08-23 16:46:51,099 INFO SenderThread:12305 [sender.py:send_request_defer():613] handle sender defer: 14
|
420 |
+
2024-08-23 16:46:51,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
421 |
+
2024-08-23 16:46:51,099 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
422 |
+
2024-08-23 16:46:51,099 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: poll_exit
|
423 |
+
2024-08-23 16:46:51,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: server_info
|
424 |
+
2024-08-23 16:46:51,100 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: poll_exit
|
425 |
+
2024-08-23 16:46:51,100 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: get_summary
|
426 |
+
2024-08-23 16:46:51,100 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: server_info
|
427 |
+
2024-08-23 16:46:51,101 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: sampled_history
|
428 |
+
2024-08-23 16:46:51,103 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: internal_messages
|
429 |
+
2024-08-23 16:46:51,103 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: job_info
|
430 |
+
2024-08-23 16:46:51,261 DEBUG SenderThread:12305 [sender.py:send_request():409] send_request: job_info
|
431 |
+
2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
432 |
+
2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
433 |
+
2024-08-23 16:46:51,262 INFO MainThread:12305 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
434 |
+
2024-08-23 16:46:51,263 DEBUG HandlerThread:12305 [handler.py:handle_request():146] handle_request: shutdown
|
435 |
+
2024-08-23 16:46:51,263 INFO HandlerThread:12305 [handler.py:finish():869] shutting down handler
|
436 |
+
2024-08-23 16:46:52,103 INFO WriterThread:12305 [datastore.py:close():296] close: /project/wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb
|
437 |
+
2024-08-23 16:46:52,262 INFO SenderThread:12305 [sender.py:finish():1572] shutting down sender
|
438 |
+
2024-08-23 16:46:52,262 INFO SenderThread:12305 [file_pusher.py:finish():172] shutting down file pusher
|
439 |
+
2024-08-23 16:46:52,262 INFO SenderThread:12305 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240823_163849-faey1t8u/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Configure stats pid to 12234
|
3 |
+
2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-23 16:38:49,358 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_163849-faey1t8u/logs/debug.log
|
9 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_163849-faey1t8u/logs/debug-internal.log
|
10 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:38:35', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 3, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 106}
|
13 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-23 16:38:49,359 INFO MainThread:12234 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-23 16:38:49,364 INFO MainThread:12234 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-23 16:38:49,364 INFO MainThread:12234 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-23 16:38:49,369 INFO MainThread:12234 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-23 16:38:49,401 INFO MainThread:12234 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-23 16:38:49,805 INFO MainThread:12234 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-23 16:38:49,830 INFO MainThread:12234 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-23 16:38:49,830 INFO MainThread:12234 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-23 16:38:49,926 INFO MainThread:12234 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-23 16:38:49,927 INFO MainThread:12234 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-23 16:38:49,927 INFO MainThread:12234 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-23 16:38:58,351 INFO MainThread:12234 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-23 16:38:58,352 INFO MainThread:12234 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-23 16:46:52,263 WARNING MsgRouterThr:12234 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240823_163849-faey1t8u/run-faey1t8u.wandb
ADDED
Binary file (49.1 kB). View file
|
|
wandb/run-20240823_202540-om09pls8/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1754785366'
|
38 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1754785366'
|
43 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 1024
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 2
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: Qwen2-0.5b-0.2_train_2024-08-23-20:25:00
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: llm_tutorial-0.2
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 3
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 500
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 2.0e-05
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 16000
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 1.0e-06
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 16000
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 612
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 17
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-06
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/Qwen2-0.5b-0.2
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 10
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 4
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 9
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724412340.7504
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
322 |
+
model_architecture:
|
323 |
+
desc: null
|
324 |
+
value: Qwen2ForCausalLM
|
325 |
+
activation_function:
|
326 |
+
desc: null
|
327 |
+
value: silu
|
328 |
+
hidden_size:
|
329 |
+
desc: null
|
330 |
+
value: 896
|
331 |
+
model_type:
|
332 |
+
desc: null
|
333 |
+
value: qwen2
|
334 |
+
max_position_embeddings:
|
335 |
+
desc: null
|
336 |
+
value: 1024
|
337 |
+
num_attention_heads:
|
338 |
+
desc: null
|
339 |
+
value: 14
|
340 |
+
num_hidden_layers:
|
341 |
+
desc: null
|
342 |
+
value: 24
|
wandb/run-20240823_202540-om09pls8/files/output.log
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/model.pt
|
5 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
6 |
+
Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/model.pt
|
7 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
8 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
9 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
10 |
+
--> applying fsdp activation checkpointing...
|
11 |
+
> datasets target sizes (minimum size):
|
12 |
+
train: 9792000
|
13 |
+
validation: 32644080
|
14 |
+
test: 6120
|
15 |
+
> building train, validation, and test datasets for GPT ...
|
16 |
+
Let split = None
|
17 |
+
Unable to save the indexes because path_to_cache is None
|
18 |
+
Building a BlendedDataset for a single MegatronDataset
|
19 |
+
Unable to save the indexes because path_to_cache is None
|
20 |
+
> finished creating GPT datasets ...
|
21 |
+
Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/optimizer.pt
|
22 |
+
Building a BlendedDataset for a single MegatronDataset
|
23 |
+
Unable to save the indexes because path_to_cache is None
|
24 |
+
[rank0]:[2024-08-23 20:26:02,460] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
25 |
+
Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000100/optimizer.pt
|
26 |
+
model info: FullyShardedDataParallel(
|
27 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
28 |
+
(model): Qwen2Model(
|
29 |
+
(embed_tokens): Embedding(151936, 896)
|
30 |
+
(layers): ModuleList(
|
31 |
+
(0-23): 24 x FullyShardedDataParallel(
|
32 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
33 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
34 |
+
(self_attn): Qwen2FlashAttention2(
|
35 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
36 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
37 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
38 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
39 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
40 |
+
)
|
41 |
+
(mlp): Qwen2MLP(
|
42 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
43 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
44 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
45 |
+
(act_fn): SiLU()
|
46 |
+
)
|
47 |
+
(input_layernorm): Qwen2RMSNorm()
|
48 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
49 |
+
)
|
50 |
+
)
|
51 |
+
)
|
52 |
+
)
|
53 |
+
(norm): Qwen2RMSNorm()
|
54 |
+
)
|
55 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
56 |
+
)
|
57 |
+
)
|
58 |
+
model config: Qwen2Config {
|
59 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
60 |
+
"architectures": [
|
61 |
+
"Qwen2ForCausalLM"
|
62 |
+
],
|
63 |
+
"attention_dropout": 0.0,
|
64 |
+
"bos_token_id": 151643,
|
65 |
+
"eos_token_id": 151643,
|
66 |
+
"hidden_act": "silu",
|
67 |
+
"hidden_size": 896,
|
68 |
+
"initializer_range": 0.02,
|
69 |
+
"intermediate_size": 4864,
|
70 |
+
"label_smoothing": 0.0,
|
71 |
+
"max_position_embeddings": 1024,
|
72 |
+
"max_window_layers": 24,
|
73 |
+
"model_type": "qwen2",
|
74 |
+
"num_attention_heads": 14,
|
75 |
+
"num_hidden_layers": 24,
|
76 |
+
"num_key_value_heads": 2,
|
77 |
+
"rms_norm_eps": 1e-06,
|
78 |
+
"rope_theta": 1000000.0,
|
79 |
+
"sliding_window": 131072,
|
80 |
+
"tie_word_embeddings": true,
|
81 |
+
"torch_dtype": "bfloat16",
|
82 |
+
"transformers_version": "4.43.3",
|
83 |
+
"use_cache": false,
|
84 |
+
"use_sliding_window": false,
|
85 |
+
"vocab_size": 151936
|
86 |
+
}
|
87 |
+
------------------------------------------------------------------
|
88 |
+
iteration: 101 , TFLOPS: 23.617124223738323, Tokens per sec: 29262.917733714043, Loss: 3.7116615772247314
|
89 |
+
------------------------------------------------------------------
|
90 |
+
------------------------------------------------------------------
|
91 |
+
iteration: 102 , TFLOPS: 72.97505903558803, Tokens per sec: 90420.11757828314, Loss: 3.7358791828155518
|
92 |
+
------------------------------------------------------------------
|
93 |
+
eval ppl=30.41041374206543, eval loss=3.414785146713257
|
94 |
+
------------------------------------------------------------------
|
95 |
+
iteration: 103 , TFLOPS: 65.5275483737983, Tokens per sec: 81192.2416628126, Loss: 3.757955551147461
|
96 |
+
------------------------------------------------------------------
|
97 |
+
------------------------------------------------------------------
|
98 |
+
iteration: 104 , TFLOPS: 73.41964709398604, Tokens per sec: 90970.9866703471, Loss: 3.730485439300537
|
99 |
+
------------------------------------------------------------------
|
100 |
+
------------------------------------------------------------------
|
101 |
+
iteration: 105 , TFLOPS: 70.30393549004248, Tokens per sec: 87110.44838107748, Loss: 3.7091140747070312
|
102 |
+
------------------------------------------------------------------
|
103 |
+
eval ppl=30.61298179626465, eval loss=3.421424150466919
|
104 |
+
------------------------------------------------------------------
|
105 |
+
iteration: 106 , TFLOPS: 67.02992418712539, Tokens per sec: 83053.76804570397, Loss: 3.732792377471924
|
106 |
+
------------------------------------------------------------------
|
107 |
+
------------------------------------------------------------------
|
108 |
+
iteration: 107 , TFLOPS: 72.30676298983876, Tokens per sec: 89592.06196815638, Loss: 3.7457761764526367
|
109 |
+
------------------------------------------------------------------
|
110 |
+
------------------------------------------------------------------
|
111 |
+
iteration: 108 , TFLOPS: 73.24548732883933, Tokens per sec: 90755.19312868938, Loss: 3.7291133403778076
|
112 |
+
------------------------------------------------------------------
|
113 |
+
eval ppl=31.11376953125, eval loss=3.437650442123413
|
114 |
+
------------------------------------------------------------------
|
115 |
+
iteration: 109 , TFLOPS: 66.08722155273428, Tokens per sec: 81885.7075580594, Loss: 3.71726131439209
|
116 |
+
------------------------------------------------------------------
|
117 |
+
------------------------------------------------------------------
|
118 |
+
iteration: 110 , TFLOPS: 73.33268637131981, Tokens per sec: 90863.23754520644, Loss: 3.7274892330169678
|
119 |
+
------------------------------------------------------------------
|
120 |
+
Traceback (most recent call last):
|
121 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
122 |
+
main()
|
123 |
+
File "/project/src/llama_recipes/finetuning.py", line 282, in main
|
124 |
+
train(
|
125 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
|
126 |
+
loss.backward()
|
127 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
|
128 |
+
torch.autograd.backward(
|
129 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
|
130 |
+
_engine_run_backward(
|
131 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
|
132 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
133 |
+
KeyboardInterrupt
|
wandb/run-20240823_202540-om09pls8/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json
ADDED
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-23T11:25:41.454442",
|
5 |
+
"startedAt": "2024-08-23T11:25:40.735970",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"1024",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"17",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"10",
|
17 |
+
"--global-batch-size",
|
18 |
+
"612",
|
19 |
+
"--train-iters",
|
20 |
+
"16000",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1754785366",
|
32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1754785366",
|
35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"2e-5",
|
38 |
+
"--min-lr",
|
39 |
+
"1e-6",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"16000",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-6",
|
58 |
+
"--save-interval",
|
59 |
+
"500",
|
60 |
+
"--eval-interval",
|
61 |
+
"3",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
72 |
+
"--fsdp-activation-checkpointing",
|
73 |
+
"--sharding-strategy",
|
74 |
+
"FULL_SHARD",
|
75 |
+
"--checkpoint-type",
|
76 |
+
"LOCAL_STATE_DICT",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--upload-all-checkpoints-to-hf",
|
80 |
+
"--hf-upload-retry-limit",
|
81 |
+
"2",
|
82 |
+
"--hf-repo-id",
|
83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
84 |
+
"--wandb-entity",
|
85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
86 |
+
"--wandb-project",
|
87 |
+
"llm_tutorial-0.2",
|
88 |
+
"--wandb-name",
|
89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-20:25:00"
|
90 |
+
],
|
91 |
+
"state": "running",
|
92 |
+
"program": "/project/examples/finetuning.py",
|
93 |
+
"codePathLocal": "examples/finetuning.py",
|
94 |
+
"codePath": "examples/finetuning.py",
|
95 |
+
"git": {
|
96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
98 |
+
},
|
99 |
+
"email": null,
|
100 |
+
"root": "/project",
|
101 |
+
"host": "gpu-koiwa-00",
|
102 |
+
"username": "koiwa",
|
103 |
+
"executable": "/usr/bin/python",
|
104 |
+
"cpu_count": 72,
|
105 |
+
"cpu_count_logical": 72,
|
106 |
+
"cpu_freq": {
|
107 |
+
"current": 2400.038999999999,
|
108 |
+
"min": 0.0,
|
109 |
+
"max": 0.0
|
110 |
+
},
|
111 |
+
"cpu_freq_per_core": [
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2400.039,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"current": 2400.039,
|
204 |
+
"min": 0.0,
|
205 |
+
"max": 0.0
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"current": 2400.039,
|
209 |
+
"min": 0.0,
|
210 |
+
"max": 0.0
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"current": 2400.039,
|
214 |
+
"min": 0.0,
|
215 |
+
"max": 0.0
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"current": 2400.039,
|
219 |
+
"min": 0.0,
|
220 |
+
"max": 0.0
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"current": 2400.039,
|
224 |
+
"min": 0.0,
|
225 |
+
"max": 0.0
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"current": 2400.039,
|
229 |
+
"min": 0.0,
|
230 |
+
"max": 0.0
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"current": 2400.039,
|
234 |
+
"min": 0.0,
|
235 |
+
"max": 0.0
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"current": 2400.039,
|
239 |
+
"min": 0.0,
|
240 |
+
"max": 0.0
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"current": 2400.039,
|
244 |
+
"min": 0.0,
|
245 |
+
"max": 0.0
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"current": 2400.039,
|
249 |
+
"min": 0.0,
|
250 |
+
"max": 0.0
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"current": 2400.039,
|
254 |
+
"min": 0.0,
|
255 |
+
"max": 0.0
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"current": 2400.039,
|
259 |
+
"min": 0.0,
|
260 |
+
"max": 0.0
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"current": 2400.039,
|
264 |
+
"min": 0.0,
|
265 |
+
"max": 0.0
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"current": 2400.039,
|
269 |
+
"min": 0.0,
|
270 |
+
"max": 0.0
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"current": 2400.039,
|
274 |
+
"min": 0.0,
|
275 |
+
"max": 0.0
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"current": 2400.039,
|
279 |
+
"min": 0.0,
|
280 |
+
"max": 0.0
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"current": 2400.039,
|
284 |
+
"min": 0.0,
|
285 |
+
"max": 0.0
|
286 |
+
},
|
287 |
+
{
|
288 |
+
"current": 2400.039,
|
289 |
+
"min": 0.0,
|
290 |
+
"max": 0.0
|
291 |
+
},
|
292 |
+
{
|
293 |
+
"current": 2400.039,
|
294 |
+
"min": 0.0,
|
295 |
+
"max": 0.0
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"current": 2400.039,
|
299 |
+
"min": 0.0,
|
300 |
+
"max": 0.0
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"current": 2400.039,
|
304 |
+
"min": 0.0,
|
305 |
+
"max": 0.0
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"current": 2400.039,
|
309 |
+
"min": 0.0,
|
310 |
+
"max": 0.0
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"current": 2400.039,
|
314 |
+
"min": 0.0,
|
315 |
+
"max": 0.0
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"current": 2400.039,
|
319 |
+
"min": 0.0,
|
320 |
+
"max": 0.0
|
321 |
+
},
|
322 |
+
{
|
323 |
+
"current": 2400.039,
|
324 |
+
"min": 0.0,
|
325 |
+
"max": 0.0
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"current": 2400.039,
|
329 |
+
"min": 0.0,
|
330 |
+
"max": 0.0
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"current": 2400.039,
|
334 |
+
"min": 0.0,
|
335 |
+
"max": 0.0
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"current": 2400.039,
|
339 |
+
"min": 0.0,
|
340 |
+
"max": 0.0
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"current": 2400.039,
|
344 |
+
"min": 0.0,
|
345 |
+
"max": 0.0
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"current": 2400.039,
|
349 |
+
"min": 0.0,
|
350 |
+
"max": 0.0
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"current": 2400.039,
|
354 |
+
"min": 0.0,
|
355 |
+
"max": 0.0
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"current": 2400.039,
|
359 |
+
"min": 0.0,
|
360 |
+
"max": 0.0
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"current": 2400.039,
|
364 |
+
"min": 0.0,
|
365 |
+
"max": 0.0
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"current": 2400.039,
|
369 |
+
"min": 0.0,
|
370 |
+
"max": 0.0
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"current": 2400.039,
|
374 |
+
"min": 0.0,
|
375 |
+
"max": 0.0
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"current": 2400.039,
|
379 |
+
"min": 0.0,
|
380 |
+
"max": 0.0
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"current": 2400.039,
|
384 |
+
"min": 0.0,
|
385 |
+
"max": 0.0
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"current": 2400.039,
|
389 |
+
"min": 0.0,
|
390 |
+
"max": 0.0
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"current": 2400.039,
|
394 |
+
"min": 0.0,
|
395 |
+
"max": 0.0
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"current": 2400.039,
|
399 |
+
"min": 0.0,
|
400 |
+
"max": 0.0
|
401 |
+
},
|
402 |
+
{
|
403 |
+
"current": 2400.039,
|
404 |
+
"min": 0.0,
|
405 |
+
"max": 0.0
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"current": 2400.039,
|
409 |
+
"min": 0.0,
|
410 |
+
"max": 0.0
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"current": 2400.039,
|
414 |
+
"min": 0.0,
|
415 |
+
"max": 0.0
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"current": 2400.039,
|
419 |
+
"min": 0.0,
|
420 |
+
"max": 0.0
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"current": 2400.039,
|
424 |
+
"min": 0.0,
|
425 |
+
"max": 0.0
|
426 |
+
},
|
427 |
+
{
|
428 |
+
"current": 2400.039,
|
429 |
+
"min": 0.0,
|
430 |
+
"max": 0.0
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"current": 2400.039,
|
434 |
+
"min": 0.0,
|
435 |
+
"max": 0.0
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"current": 2400.039,
|
439 |
+
"min": 0.0,
|
440 |
+
"max": 0.0
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"current": 2400.039,
|
444 |
+
"min": 0.0,
|
445 |
+
"max": 0.0
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"current": 2400.039,
|
449 |
+
"min": 0.0,
|
450 |
+
"max": 0.0
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"current": 2400.039,
|
454 |
+
"min": 0.0,
|
455 |
+
"max": 0.0
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"current": 2400.039,
|
459 |
+
"min": 0.0,
|
460 |
+
"max": 0.0
|
461 |
+
},
|
462 |
+
{
|
463 |
+
"current": 2400.039,
|
464 |
+
"min": 0.0,
|
465 |
+
"max": 0.0
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"current": 2400.039,
|
469 |
+
"min": 0.0,
|
470 |
+
"max": 0.0
|
471 |
+
}
|
472 |
+
],
|
473 |
+
"disk": {
|
474 |
+
"/": {
|
475 |
+
"total": 0.0625,
|
476 |
+
"used": 1.1444091796875e-05
|
477 |
+
}
|
478 |
+
},
|
479 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
480 |
+
"gpu_count": 4,
|
481 |
+
"gpu_devices": [
|
482 |
+
{
|
483 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
484 |
+
"memory_total": 42949672960
|
485 |
+
},
|
486 |
+
{
|
487 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
488 |
+
"memory_total": 42949672960
|
489 |
+
},
|
490 |
+
{
|
491 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
492 |
+
"memory_total": 42949672960
|
493 |
+
},
|
494 |
+
{
|
495 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
496 |
+
"memory_total": 42949672960
|
497 |
+
}
|
498 |
+
],
|
499 |
+
"memory": {
|
500 |
+
"total": 226.66352462768555
|
501 |
+
}
|
502 |
+
}
|
wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 3.7274892330169678, "training/perplexity": 41.57459289701456, "utils/batch_size": 17, "utils/global_batch_size": 612, "utils/seq_len": 1025, "utils/gradient_accumulation_steps": 9, "utils/iteration": 110, "optimizer/lr": 5.18e-06, "optimizer/variance_l2": 0.04658828859035772, "optimizer/variance_sqrt_l2": 0.8728927373830674, "optimizer/momentum_l2": 0.8215464439288661, "optimizer/weight_l2": 640.8711356427281, "optimizer/variance_l1": 0.7605819702148438, "optimizer/variance_sqrt_l1": 1761.25, "optimizer/momentum_l1": 1269.75, "optimizer/weight_l1": 1809664.0, "optimizer/variance_abs_max": 0.04248046875, "optimizer/variance_sqrt_abs_max": 0.2060546875, "optimizer/momentum_abs_max": 0.18359375, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 6.903782178000256, "stats/tokens_per_sec": 90863.23754520644, "stats/tokens_per_sec_per_gpu": 22715.80938630161, "stats/tflops": 73.33268637131981, "_timestamp": 1724412471.451136, "_runtime": 130.7007360458374, "_step": 110, "evaluation/val_loss": 3.437650442123413, "evaluation/val_ppl": 31.11376953125, "_wandb": {"runtime": 131}}
|
wandb/run-20240823_202540-om09pls8/logs/debug-internal.log
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 20:25:40,751 INFO StreamThr :13176 [internal.py:wandb_internal():86] W&B internal server running at pid: 13176, started at: 2024-08-23 20:25:40.750453
|
2 |
+
2024-08-23 20:25:40,752 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-23 20:25:40,755 INFO WriterThread:13176 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb
|
4 |
+
2024-08-23 20:25:40,756 DEBUG SenderThread:13176 [sender.py:send():382] send: header
|
5 |
+
2024-08-23 20:25:40,869 DEBUG SenderThread:13176 [sender.py:send():382] send: run
|
6 |
+
2024-08-23 20:25:41,336 INFO SenderThread:13176 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_202540-om09pls8/files
|
7 |
+
2024-08-23 20:25:41,336 INFO SenderThread:13176 [sender.py:_start_run_threads():1136] run started: om09pls8 with start time 1724412340.7504
|
8 |
+
2024-08-23 20:25:41,342 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-23 20:25:41,342 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-23 20:25:41,412 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-23 20:25:41,418 DEBUG HandlerThread:13176 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-23 20:25:41,418 DEBUG HandlerThread:13176 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-23 20:25:41,418 INFO HandlerThread:13176 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-23 20:25:41,418 INFO SystemMonitor:13176 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-23 20:25:41,418 INFO HandlerThread:13176 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-23 20:25:41,419 INFO SystemMonitor:13176 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-23 20:25:41,419 INFO SystemMonitor:13176 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-23 20:25:41,420 INFO SystemMonitor:13176 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-23 20:25:41,421 INFO SystemMonitor:13176 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-23 20:25:41,423 INFO SystemMonitor:13176 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-23 20:25:41,454 DEBUG HandlerThread:13176 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-23 20:25:41,456 DEBUG HandlerThread:13176 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-23 20:25:41,470 DEBUG HandlerThread:13176 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T11:25:41.454442', 'startedAt': '2024-08-23T11:25:40.735970', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '131072', '--micro-batch-size', '17', '--valid_micro_batch_size', '10', '--global-batch-size', '612', '--train-iters', '16000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '16000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '500', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-20:25:00'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 72, 'cpu_count_logical': 72, 'cpu_freq': {'current': 2400.038999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 4, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 226.66352462768555}}
|
26 |
+
2024-08-23 20:25:41,470 INFO HandlerThread:13176 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-23 20:25:41,470 INFO HandlerThread:13176 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-23 20:25:41,472 INFO HandlerThread:13176 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-23 20:25:41,502 DEBUG SenderThread:13176 [sender.py:send():382] send: files
|
30 |
+
2024-08-23 20:25:41,502 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-23 20:25:41,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-23 20:25:41,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-23 20:25:41,514 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-23 20:25:41,514 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-23 20:25:41,516 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-23 20:25:41,704 DEBUG SenderThread:13176 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-23 20:25:42,229 INFO wandb-upload_0:13176 [upload_job.py:push():131] Uploaded file /tmp/tmpnyk2zt9mwandb/u7uqpthk-wandb-metadata.json
|
38 |
+
2024-08-23 20:25:42,338 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
39 |
+
2024-08-23 20:25:42,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json
|
40 |
+
2024-08-23 20:25:42,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt
|
41 |
+
2024-08-23 20:25:44,339 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
42 |
+
2024-08-23 20:25:46,083 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-23 20:25:46,340 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
44 |
+
2024-08-23 20:25:47,341 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
45 |
+
2024-08-23 20:25:51,887 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-23 20:25:52,344 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
47 |
+
2024-08-23 20:25:53,345 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
48 |
+
2024-08-23 20:25:55,346 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
49 |
+
2024-08-23 20:25:56,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
50 |
+
2024-08-23 20:25:56,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
51 |
+
2024-08-23 20:25:56,513 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
52 |
+
2024-08-23 20:25:57,347 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
53 |
+
2024-08-23 20:25:57,765 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
54 |
+
2024-08-23 20:25:58,348 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
55 |
+
2024-08-23 20:25:59,349 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
56 |
+
2024-08-23 20:26:02,595 DEBUG SenderThread:13176 [sender.py:send():382] send: config
|
57 |
+
2024-08-23 20:26:02,595 DEBUG SenderThread:13176 [sender.py:send():382] send: config
|
58 |
+
2024-08-23 20:26:03,351 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
59 |
+
2024-08-23 20:26:03,596 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
60 |
+
2024-08-23 20:26:04,352 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
61 |
+
2024-08-23 20:26:08,596 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-23 20:26:11,513 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
63 |
+
2024-08-23 20:26:11,514 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
64 |
+
2024-08-23 20:26:11,514 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
65 |
+
2024-08-23 20:26:13,784 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
66 |
+
2024-08-23 20:26:14,359 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/config.yaml
|
67 |
+
2024-08-23 20:26:18,968 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
68 |
+
2024-08-23 20:26:23,969 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
69 |
+
2024-08-23 20:26:28,970 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
70 |
+
2024-08-23 20:26:30,283 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
71 |
+
2024-08-23 20:26:30,422 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
72 |
+
2024-08-23 20:26:30,422 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
73 |
+
2024-08-23 20:26:34,601 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
74 |
+
2024-08-23 20:26:39,602 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
75 |
+
2024-08-23 20:26:39,739 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
76 |
+
2024-08-23 20:26:41,424 DEBUG SystemMonitor:13176 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
77 |
+
2024-08-23 20:26:41,427 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
|
78 |
+
2024-08-23 20:26:42,374 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
79 |
+
2024-08-23 20:26:45,083 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
80 |
+
2024-08-23 20:26:45,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
81 |
+
2024-08-23 20:26:45,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
82 |
+
2024-08-23 20:26:45,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
83 |
+
2024-08-23 20:26:46,679 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
84 |
+
2024-08-23 20:26:46,682 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
85 |
+
2024-08-23 20:26:46,682 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
86 |
+
2024-08-23 20:26:46,684 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
87 |
+
2024-08-23 20:26:47,378 INFO Thread-12 :13176 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
88 |
+
2024-08-23 20:26:48,379 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
89 |
+
2024-08-23 20:26:49,094 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
90 |
+
2024-08-23 20:26:50,093 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
91 |
+
2024-08-23 20:26:50,380 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
92 |
+
2024-08-23 20:26:55,094 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
93 |
+
2024-08-23 20:26:56,822 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
94 |
+
2024-08-23 20:26:56,825 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
95 |
+
2024-08-23 20:26:56,826 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
96 |
+
2024-08-23 20:26:56,829 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
97 |
+
2024-08-23 20:26:57,385 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
98 |
+
2024-08-23 20:26:58,386 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
99 |
+
2024-08-23 20:27:00,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
100 |
+
2024-08-23 20:27:00,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
101 |
+
2024-08-23 20:27:00,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
102 |
+
2024-08-23 20:27:00,326 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
103 |
+
2024-08-23 20:27:03,720 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
104 |
+
2024-08-23 20:27:03,722 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
105 |
+
2024-08-23 20:27:03,722 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
106 |
+
2024-08-23 20:27:03,723 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
107 |
+
2024-08-23 20:27:04,390 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
108 |
+
2024-08-23 20:27:05,764 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
109 |
+
2024-08-23 20:27:06,392 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
110 |
+
2024-08-23 20:27:10,765 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
111 |
+
2024-08-23 20:27:10,923 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
112 |
+
2024-08-23 20:27:10,926 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
113 |
+
2024-08-23 20:27:10,926 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
114 |
+
2024-08-23 20:27:10,928 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
115 |
+
2024-08-23 20:27:11,395 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
116 |
+
2024-08-23 20:27:11,429 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
|
117 |
+
2024-08-23 20:27:12,396 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
118 |
+
2024-08-23 20:27:13,173 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
119 |
+
2024-08-23 20:27:14,397 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
120 |
+
2024-08-23 20:27:15,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
121 |
+
2024-08-23 20:27:15,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
122 |
+
2024-08-23 20:27:15,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
123 |
+
2024-08-23 20:27:16,306 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
124 |
+
2024-08-23 20:27:20,728 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
125 |
+
2024-08-23 20:27:20,730 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
126 |
+
2024-08-23 20:27:20,730 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
127 |
+
2024-08-23 20:27:20,731 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
128 |
+
2024-08-23 20:27:21,402 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
129 |
+
2024-08-23 20:27:21,772 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
130 |
+
2024-08-23 20:27:22,403 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
131 |
+
2024-08-23 20:27:26,773 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
132 |
+
2024-08-23 20:27:27,731 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
133 |
+
2024-08-23 20:27:27,734 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
134 |
+
2024-08-23 20:27:27,734 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
135 |
+
2024-08-23 20:27:27,736 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
136 |
+
2024-08-23 20:27:28,408 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
137 |
+
2024-08-23 20:27:30,084 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
138 |
+
2024-08-23 20:27:30,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
139 |
+
2024-08-23 20:27:30,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
140 |
+
2024-08-23 20:27:30,409 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
141 |
+
2024-08-23 20:27:32,346 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
142 |
+
2024-08-23 20:27:34,646 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
143 |
+
2024-08-23 20:27:34,649 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
144 |
+
2024-08-23 20:27:34,649 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
145 |
+
2024-08-23 20:27:34,651 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
146 |
+
2024-08-23 20:27:35,413 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
147 |
+
2024-08-23 20:27:36,413 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
148 |
+
2024-08-23 20:27:36,883 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
149 |
+
2024-08-23 20:27:37,883 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
150 |
+
2024-08-23 20:27:38,415 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
151 |
+
2024-08-23 20:27:41,432 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
|
152 |
+
2024-08-23 20:27:43,433 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
153 |
+
2024-08-23 20:27:44,546 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
154 |
+
2024-08-23 20:27:44,549 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
155 |
+
2024-08-23 20:27:44,549 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
156 |
+
2024-08-23 20:27:44,551 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
157 |
+
2024-08-23 20:27:45,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: stop_status
|
158 |
+
2024-08-23 20:27:45,085 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
159 |
+
2024-08-23 20:27:45,085 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: stop_status
|
160 |
+
2024-08-23 20:27:45,420 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
161 |
+
2024-08-23 20:27:46,421 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
162 |
+
2024-08-23 20:27:49,351 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
163 |
+
2024-08-23 20:27:51,452 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: partial_history
|
164 |
+
2024-08-23 20:27:51,454 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
165 |
+
2024-08-23 20:27:51,454 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
166 |
+
2024-08-23 20:27:51,456 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
167 |
+
2024-08-23 20:27:52,425 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
168 |
+
2024-08-23 20:27:53,012 DEBUG SenderThread:13176 [sender.py:send():382] send: exit
|
169 |
+
2024-08-23 20:27:53,012 INFO SenderThread:13176 [sender.py:send_exit():589] handling exit code: 255
|
170 |
+
2024-08-23 20:27:53,013 INFO SenderThread:13176 [sender.py:send_exit():591] handling runtime: 131
|
171 |
+
2024-08-23 20:27:53,014 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
172 |
+
2024-08-23 20:27:53,014 INFO SenderThread:13176 [sender.py:send_exit():597] send defer
|
173 |
+
2024-08-23 20:27:53,014 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
174 |
+
2024-08-23 20:27:53,014 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 0
|
175 |
+
2024-08-23 20:27:53,015 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
176 |
+
2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 0
|
177 |
+
2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 1
|
178 |
+
2024-08-23 20:27:53,015 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
179 |
+
2024-08-23 20:27:53,015 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 1
|
180 |
+
2024-08-23 20:27:53,015 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
181 |
+
2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 1
|
182 |
+
2024-08-23 20:27:53,015 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 2
|
183 |
+
2024-08-23 20:27:53,015 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
184 |
+
2024-08-23 20:27:53,015 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 2
|
185 |
+
2024-08-23 20:27:53,015 INFO HandlerThread:13176 [system_monitor.py:finish():203] Stopping system monitor
|
186 |
+
2024-08-23 20:27:53,015 DEBUG SystemMonitor:13176 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
187 |
+
2024-08-23 20:27:53,016 DEBUG SystemMonitor:13176 [system_monitor.py:_start():183] Publishing last batch of metrics
|
188 |
+
2024-08-23 20:27:53,016 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined cpu monitor
|
189 |
+
2024-08-23 20:27:53,018 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined disk monitor
|
190 |
+
2024-08-23 20:27:53,348 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined gpu monitor
|
191 |
+
2024-08-23 20:27:53,349 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined memory monitor
|
192 |
+
2024-08-23 20:27:53,349 INFO HandlerThread:13176 [interfaces.py:finish():202] Joined network monitor
|
193 |
+
2024-08-23 20:27:53,350 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
194 |
+
2024-08-23 20:27:53,350 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 2
|
195 |
+
2024-08-23 20:27:53,350 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 3
|
196 |
+
2024-08-23 20:27:53,350 DEBUG SenderThread:13176 [sender.py:send():382] send: stats
|
197 |
+
2024-08-23 20:27:53,350 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
198 |
+
2024-08-23 20:27:53,351 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 3
|
199 |
+
2024-08-23 20:27:53,354 DEBUG SenderThread:13176 [sender.py:send():382] send: history
|
200 |
+
2024-08-23 20:27:53,354 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: summary_record
|
201 |
+
2024-08-23 20:27:53,355 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
202 |
+
2024-08-23 20:27:53,355 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
203 |
+
2024-08-23 20:27:53,355 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 3
|
204 |
+
2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 4
|
205 |
+
2024-08-23 20:27:53,356 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
206 |
+
2024-08-23 20:27:53,356 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 4
|
207 |
+
2024-08-23 20:27:53,356 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
208 |
+
2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 4
|
209 |
+
2024-08-23 20:27:53,356 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 5
|
210 |
+
2024-08-23 20:27:53,356 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
211 |
+
2024-08-23 20:27:53,356 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 5
|
212 |
+
2024-08-23 20:27:53,358 DEBUG SenderThread:13176 [sender.py:send():382] send: summary
|
213 |
+
2024-08-23 20:27:53,358 INFO SenderThread:13176 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
214 |
+
2024-08-23 20:27:53,359 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
215 |
+
2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 5
|
216 |
+
2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 6
|
217 |
+
2024-08-23 20:27:53,359 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
218 |
+
2024-08-23 20:27:53,359 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 6
|
219 |
+
2024-08-23 20:27:53,359 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
220 |
+
2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 6
|
221 |
+
2024-08-23 20:27:53,359 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 7
|
222 |
+
2024-08-23 20:27:53,359 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: status_report
|
223 |
+
2024-08-23 20:27:53,360 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
224 |
+
2024-08-23 20:27:53,360 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 7
|
225 |
+
2024-08-23 20:27:53,360 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
226 |
+
2024-08-23 20:27:53,360 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 7
|
227 |
+
2024-08-23 20:27:53,427 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
228 |
+
2024-08-23 20:27:54,012 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
|
229 |
+
2024-08-23 20:27:54,427 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
230 |
+
2024-08-23 20:27:54,598 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 8
|
231 |
+
2024-08-23 20:27:54,598 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
|
232 |
+
2024-08-23 20:27:54,598 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
233 |
+
2024-08-23 20:27:54,598 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 8
|
234 |
+
2024-08-23 20:27:54,599 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
235 |
+
2024-08-23 20:27:54,599 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 8
|
236 |
+
2024-08-23 20:27:54,599 INFO SenderThread:13176 [job_builder.py:build():296] Attempting to build job artifact
|
237 |
+
2024-08-23 20:27:54,600 INFO SenderThread:13176 [job_builder.py:_get_source_type():426] is repo sourced job
|
238 |
+
2024-08-23 20:27:54,620 INFO SenderThread:13176 [job_builder.py:build():402] adding wandb-job metadata file
|
239 |
+
2024-08-23 20:27:54,630 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 9
|
240 |
+
2024-08-23 20:27:54,631 DEBUG SenderThread:13176 [sender.py:send():382] send: artifact
|
241 |
+
2024-08-23 20:27:54,631 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
242 |
+
2024-08-23 20:27:54,632 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 9
|
243 |
+
2024-08-23 20:27:55,013 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
|
244 |
+
2024-08-23 20:27:55,428 INFO Thread-12 :13176 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
245 |
+
2024-08-23 20:27:55,507 INFO SenderThread:13176 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk4ODkxMQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'versionIndex': 3}}}
|
246 |
+
2024-08-23 20:27:55,507 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
247 |
+
2024-08-23 20:27:55,507 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 9
|
248 |
+
2024-08-23 20:27:55,507 INFO SenderThread:13176 [dir_watcher.py:finish():358] shutting down directory watcher
|
249 |
+
2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_202540-om09pls8/files
|
250 |
+
2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt requirements.txt
|
251 |
+
2024-08-23 20:27:56,429 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/config.yaml config.yaml
|
252 |
+
2024-08-23 20:27:56,431 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/wandb-metadata.json wandb-metadata.json
|
253 |
+
2024-08-23 20:27:56,431 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json wandb-summary.json
|
254 |
+
2024-08-23 20:27:56,433 INFO SenderThread:13176 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_202540-om09pls8/files/output.log output.log
|
255 |
+
2024-08-23 20:27:56,433 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 10
|
256 |
+
2024-08-23 20:27:56,434 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
|
257 |
+
2024-08-23 20:27:56,435 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
258 |
+
2024-08-23 20:27:56,435 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 10
|
259 |
+
2024-08-23 20:27:56,436 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
260 |
+
2024-08-23 20:27:56,436 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 10
|
261 |
+
2024-08-23 20:27:56,436 INFO SenderThread:13176 [file_pusher.py:finish():172] shutting down file pusher
|
262 |
+
2024-08-23 20:27:56,841 INFO wandb-upload_0:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/requirements.txt
|
263 |
+
2024-08-23 20:27:56,894 INFO wandb-upload_2:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/wandb-summary.json
|
264 |
+
2024-08-23 20:27:56,896 INFO wandb-upload_3:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/output.log
|
265 |
+
2024-08-23 20:27:56,902 INFO wandb-upload_1:13176 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_202540-om09pls8/files/config.yaml
|
266 |
+
2024-08-23 20:27:57,013 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
|
267 |
+
2024-08-23 20:27:57,014 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
|
268 |
+
2024-08-23 20:27:57,103 INFO Thread-11 (_thread_body):13176 [sender.py:transition_state():617] send defer: 11
|
269 |
+
2024-08-23 20:27:57,103 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
270 |
+
2024-08-23 20:27:57,103 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 11
|
271 |
+
2024-08-23 20:27:57,103 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
272 |
+
2024-08-23 20:27:57,103 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 11
|
273 |
+
2024-08-23 20:27:57,103 INFO SenderThread:13176 [file_pusher.py:join():178] waiting for file pusher
|
274 |
+
2024-08-23 20:27:57,103 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 12
|
275 |
+
2024-08-23 20:27:57,103 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
276 |
+
2024-08-23 20:27:57,103 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 12
|
277 |
+
2024-08-23 20:27:57,104 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
278 |
+
2024-08-23 20:27:57,104 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 12
|
279 |
+
2024-08-23 20:27:57,104 INFO SenderThread:13176 [file_stream.py:finish():595] file stream finish called
|
280 |
+
2024-08-23 20:27:57,389 INFO SenderThread:13176 [file_stream.py:finish():599] file stream finish is done
|
281 |
+
2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 13
|
282 |
+
2024-08-23 20:27:57,389 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
283 |
+
2024-08-23 20:27:57,389 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 13
|
284 |
+
2024-08-23 20:27:57,389 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
285 |
+
2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 13
|
286 |
+
2024-08-23 20:27:57,389 INFO SenderThread:13176 [sender.py:transition_state():617] send defer: 14
|
287 |
+
2024-08-23 20:27:57,390 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: defer
|
288 |
+
2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send():382] send: final
|
289 |
+
2024-08-23 20:27:57,390 INFO HandlerThread:13176 [handler.py:handle_request_defer():172] handle defer: 14
|
290 |
+
2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send():382] send: footer
|
291 |
+
2024-08-23 20:27:57,390 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: defer
|
292 |
+
2024-08-23 20:27:57,390 INFO SenderThread:13176 [sender.py:send_request_defer():613] handle sender defer: 14
|
293 |
+
2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
|
294 |
+
2024-08-23 20:27:57,391 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
|
295 |
+
2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: poll_exit
|
296 |
+
2024-08-23 20:27:57,391 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: server_info
|
297 |
+
2024-08-23 20:27:57,392 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: poll_exit
|
298 |
+
2024-08-23 20:27:57,392 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: get_summary
|
299 |
+
2024-08-23 20:27:57,392 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: server_info
|
300 |
+
2024-08-23 20:27:57,393 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: sampled_history
|
301 |
+
2024-08-23 20:27:57,395 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: internal_messages
|
302 |
+
2024-08-23 20:27:57,395 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: job_info
|
303 |
+
2024-08-23 20:27:57,563 DEBUG SenderThread:13176 [sender.py:send_request():409] send_request: job_info
|
304 |
+
2024-08-23 20:27:57,563 INFO MainThread:13176 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
305 |
+
2024-08-23 20:27:57,564 INFO MainThread:13176 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
306 |
+
2024-08-23 20:27:57,564 INFO MainThread:13176 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
307 |
+
2024-08-23 20:27:57,564 DEBUG HandlerThread:13176 [handler.py:handle_request():146] handle_request: shutdown
|
308 |
+
2024-08-23 20:27:57,564 INFO HandlerThread:13176 [handler.py:finish():869] shutting down handler
|
309 |
+
2024-08-23 20:27:58,396 INFO WriterThread:13176 [datastore.py:close():296] close: /project/wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb
|
310 |
+
2024-08-23 20:27:58,563 INFO SenderThread:13176 [sender.py:finish():1572] shutting down sender
|
311 |
+
2024-08-23 20:27:58,563 INFO SenderThread:13176 [file_pusher.py:finish():172] shutting down file pusher
|
312 |
+
2024-08-23 20:27:58,563 INFO SenderThread:13176 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240823_202540-om09pls8/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 20:25:40,742 INFO MainThread:12857 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Configure stats pid to 12857
|
3 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_202540-om09pls8/logs/debug.log
|
9 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_202540-om09pls8/logs/debug-internal.log
|
10 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-20:25:00', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 500, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 16000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 16000, 'train_samples': None, 'global_batch_size': 612, 'micro_batch_size': 17, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 10, 'rank': 0, 'world_size': 4, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 9}
|
13 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-23 20:25:40,743 INFO MainThread:12857 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-23 20:25:40,748 INFO MainThread:12857 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-23 20:25:40,750 INFO MainThread:12857 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-23 20:25:40,754 INFO MainThread:12857 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-23 20:25:40,864 INFO MainThread:12857 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-23 20:25:41,341 INFO MainThread:12857 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-23 20:25:41,364 INFO MainThread:12857 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-23 20:25:41,364 INFO MainThread:12857 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-23 20:25:41,512 INFO MainThread:12857 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-23 20:25:41,513 INFO MainThread:12857 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-23 20:25:41,514 INFO MainThread:12857 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-23 20:26:02,594 INFO MainThread:12857 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 1024, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-23 20:26:02,595 INFO MainThread:12857 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 4}
|
30 |
+
2024-08-23 20:27:58,565 WARNING MsgRouterThr:12857 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240823_202540-om09pls8/run-om09pls8.wandb
ADDED
Binary file (47.7 kB). View file
|
|
wandb/run-20240831_192346-5vo4p2k7/files/config.yaml
ADDED
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: SHARD_GRAD_OP
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value: null
|
30 |
+
valid_data_path:
|
31 |
+
desc: null
|
32 |
+
value: null
|
33 |
+
test_data_path:
|
34 |
+
desc: null
|
35 |
+
value: null
|
36 |
+
data_cache_path:
|
37 |
+
desc: null
|
38 |
+
value: null
|
39 |
+
vocab_size:
|
40 |
+
desc: null
|
41 |
+
value: null
|
42 |
+
vocab_file:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
merge_file:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
seq_length:
|
49 |
+
desc: null
|
50 |
+
value: 2048
|
51 |
+
num_workers:
|
52 |
+
desc: null
|
53 |
+
value: 4
|
54 |
+
tokenizer_type:
|
55 |
+
desc: null
|
56 |
+
value: HFPreTrainedTokenizer
|
57 |
+
tokenizer_model:
|
58 |
+
desc: null
|
59 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
60 |
+
reset_position_ids:
|
61 |
+
desc: null
|
62 |
+
value: false
|
63 |
+
reset_attention_mask:
|
64 |
+
desc: null
|
65 |
+
value: false
|
66 |
+
eod_mask_loss:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
retro_return_doc_ids:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
short_seq_prob:
|
73 |
+
desc: null
|
74 |
+
value: 0.1
|
75 |
+
vocab_extra_ids:
|
76 |
+
desc: null
|
77 |
+
value: 0
|
78 |
+
seed:
|
79 |
+
desc: null
|
80 |
+
value: 1234
|
81 |
+
use_mpi:
|
82 |
+
desc: null
|
83 |
+
value: false
|
84 |
+
wandb_entity:
|
85 |
+
desc: null
|
86 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
87 |
+
wandb_name:
|
88 |
+
desc: null
|
89 |
+
value: yans-baseline-qwen2-0.5B-3.5e-5-ichikara_train_2024-08-31-19:23:33
|
90 |
+
wandb_project:
|
91 |
+
desc: null
|
92 |
+
value: yans_experiment
|
93 |
+
quantization:
|
94 |
+
desc: null
|
95 |
+
value: false
|
96 |
+
use_freeze_layers:
|
97 |
+
desc: null
|
98 |
+
value: false
|
99 |
+
freeze_layers:
|
100 |
+
desc: null
|
101 |
+
value: null
|
102 |
+
bf16:
|
103 |
+
desc: null
|
104 |
+
value: true
|
105 |
+
fp16:
|
106 |
+
desc: null
|
107 |
+
value: false
|
108 |
+
mixed_precision:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
param_dtype:
|
112 |
+
desc: null
|
113 |
+
value: null
|
114 |
+
load:
|
115 |
+
desc: null
|
116 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
|
117 |
+
save:
|
118 |
+
desc: null
|
119 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
|
120 |
+
base_model:
|
121 |
+
desc: null
|
122 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
123 |
+
use_better_transformer:
|
124 |
+
desc: null
|
125 |
+
value: false
|
126 |
+
grad_clip_norm:
|
127 |
+
desc: null
|
128 |
+
value: 1.0
|
129 |
+
eval_interval:
|
130 |
+
desc: null
|
131 |
+
value: 100
|
132 |
+
save_interval:
|
133 |
+
desc: null
|
134 |
+
value: 100
|
135 |
+
eval_iters:
|
136 |
+
desc: null
|
137 |
+
value: 10
|
138 |
+
optimizer:
|
139 |
+
desc: null
|
140 |
+
value: anyprecision
|
141 |
+
lr:
|
142 |
+
desc: null
|
143 |
+
value: 2.0e-05
|
144 |
+
lr_decay_style:
|
145 |
+
desc: null
|
146 |
+
value: cosine
|
147 |
+
lr_decay_iters:
|
148 |
+
desc: null
|
149 |
+
value: 1000
|
150 |
+
lr_warmup_iters:
|
151 |
+
desc: null
|
152 |
+
value: 25
|
153 |
+
min_lr:
|
154 |
+
desc: null
|
155 |
+
value: 1.0e-06
|
156 |
+
train_iters:
|
157 |
+
desc: null
|
158 |
+
value: 1000
|
159 |
+
train_samples:
|
160 |
+
desc: null
|
161 |
+
value: null
|
162 |
+
global_batch_size:
|
163 |
+
desc: null
|
164 |
+
value: 16
|
165 |
+
micro_batch_size:
|
166 |
+
desc: null
|
167 |
+
value: 4
|
168 |
+
make_vocab_size_divisible_by:
|
169 |
+
desc: null
|
170 |
+
value: 128
|
171 |
+
sliding_window_size:
|
172 |
+
desc: null
|
173 |
+
value: 4096
|
174 |
+
skip_batch:
|
175 |
+
desc: null
|
176 |
+
value: null
|
177 |
+
no_save_optimizer_state:
|
178 |
+
desc: null
|
179 |
+
value: false
|
180 |
+
continual_pretraining:
|
181 |
+
desc: null
|
182 |
+
value: false
|
183 |
+
instruction_tuning:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
direct_preference_optimization:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
attention_dropout:
|
190 |
+
desc: null
|
191 |
+
value: 0.1
|
192 |
+
hidden_dropout:
|
193 |
+
desc: null
|
194 |
+
value: 0.1
|
195 |
+
weight_decay:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
adam_beta1:
|
199 |
+
desc: null
|
200 |
+
value: 0.9
|
201 |
+
adam_beta2:
|
202 |
+
desc: null
|
203 |
+
value: 0.99
|
204 |
+
adam_eps:
|
205 |
+
desc: null
|
206 |
+
value: 1.0e-06
|
207 |
+
hf_transformer_model_dir:
|
208 |
+
desc: null
|
209 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
210 |
+
instruction_train_data_path:
|
211 |
+
desc: null
|
212 |
+
value: /work/datasets/bin/ichikara/train/data
|
213 |
+
instruction_valid_data_path:
|
214 |
+
desc: null
|
215 |
+
value: /work/datasets/bin/ichikara/valid/data
|
216 |
+
epoch:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_dataset_size:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
save_sampler_state:
|
223 |
+
desc: null
|
224 |
+
value: true
|
225 |
+
label_smoothing:
|
226 |
+
desc: null
|
227 |
+
value: 0.0
|
228 |
+
save_n_checkpoints:
|
229 |
+
desc: null
|
230 |
+
value: 10
|
231 |
+
hf_repo_id:
|
232 |
+
desc: null
|
233 |
+
value: koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara
|
234 |
+
create_public_hf_repo:
|
235 |
+
desc: null
|
236 |
+
value: false
|
237 |
+
upload_all_checkpoints_to_hf:
|
238 |
+
desc: null
|
239 |
+
value: false
|
240 |
+
hf_upload_retry_limit:
|
241 |
+
desc: null
|
242 |
+
value: 2
|
243 |
+
exit_duration_in_mins:
|
244 |
+
desc: null
|
245 |
+
value: null
|
246 |
+
source_key:
|
247 |
+
desc: null
|
248 |
+
value: source
|
249 |
+
target_key:
|
250 |
+
desc: null
|
251 |
+
value: target
|
252 |
+
attn_implementation:
|
253 |
+
desc: null
|
254 |
+
value: flash_attention_2
|
255 |
+
efficient_instruction_tuning:
|
256 |
+
desc: null
|
257 |
+
value: true
|
258 |
+
remove_padding_masking:
|
259 |
+
desc: null
|
260 |
+
value: true
|
261 |
+
save_start_iter:
|
262 |
+
desc: null
|
263 |
+
value: null
|
264 |
+
valid_micro_batch_size:
|
265 |
+
desc: null
|
266 |
+
value: 1
|
267 |
+
rank:
|
268 |
+
desc: null
|
269 |
+
value: 0
|
270 |
+
world_size:
|
271 |
+
desc: null
|
272 |
+
value: 1
|
273 |
+
padded_vocab_size:
|
274 |
+
desc: null
|
275 |
+
value: 151680
|
276 |
+
gradient_accumulation_steps:
|
277 |
+
desc: null
|
278 |
+
value: 4
|
279 |
+
_wandb:
|
280 |
+
desc: null
|
281 |
+
value:
|
282 |
+
python_version: 3.10.12
|
283 |
+
cli_version: 0.16.3
|
284 |
+
framework: huggingface
|
285 |
+
huggingface_version: 4.43.3
|
286 |
+
is_jupyter_run: false
|
287 |
+
is_kaggle_kernel: false
|
288 |
+
start_time: 1725099826.21653
|
289 |
+
t:
|
290 |
+
1:
|
291 |
+
- 1
|
292 |
+
- 11
|
293 |
+
- 49
|
294 |
+
- 55
|
295 |
+
- 71
|
296 |
+
- 105
|
297 |
+
2:
|
298 |
+
- 1
|
299 |
+
- 11
|
300 |
+
- 49
|
301 |
+
- 55
|
302 |
+
- 71
|
303 |
+
- 105
|
304 |
+
3:
|
305 |
+
- 13
|
306 |
+
- 16
|
307 |
+
- 23
|
308 |
+
4: 3.10.12
|
309 |
+
5: 0.16.3
|
310 |
+
6: 4.43.3
|
311 |
+
8:
|
312 |
+
- 5
|
313 |
+
13: linux-x86_64
|
wandb/run-20240831_192346-5vo4p2k7/files/output.log
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara/latest_iteration.txt
|
8 |
+
Traceback (most recent call last):
|
9 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
10 |
+
main()
|
11 |
+
File "/project/src/llama_recipes/finetuning.py", line 103, in main
|
12 |
+
model = get_model(
|
13 |
+
File "/project/src/llama_recipes/get_models.py", line 106, in get_model
|
14 |
+
assert sliding_window == 131072
|
15 |
+
AssertionError
|
wandb/run-20240831_192346-5vo4p2k7/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240831_192346-5vo4p2k7/files/wandb-metadata.json
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-31T10:23:46.789150",
|
5 |
+
"startedAt": "2024-08-31T10:23:46.204215",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"2048",
|
11 |
+
"--micro-batch-size",
|
12 |
+
"4",
|
13 |
+
"--valid_micro_batch_size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"16",
|
17 |
+
"--train-iters",
|
18 |
+
"1000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--instruction-train-data-path",
|
24 |
+
"/work/datasets/bin/ichikara/train/data",
|
25 |
+
"--instruction-valid-data-path",
|
26 |
+
"/work/datasets/bin/ichikara/valid/data",
|
27 |
+
"--efficient-instruction-tuning",
|
28 |
+
"--remove-padding-masking",
|
29 |
+
"--source-key",
|
30 |
+
"source",
|
31 |
+
"--target-key",
|
32 |
+
"target",
|
33 |
+
"--lr",
|
34 |
+
"2e-5",
|
35 |
+
"--min-lr",
|
36 |
+
"1e-6",
|
37 |
+
"--lr-decay-style",
|
38 |
+
"cosine",
|
39 |
+
"--lr-warmup-iters",
|
40 |
+
"25",
|
41 |
+
"--lr-decay-iters",
|
42 |
+
"1000",
|
43 |
+
"--weight-decay",
|
44 |
+
"0.1",
|
45 |
+
"--grad-clip-norm",
|
46 |
+
"1.0",
|
47 |
+
"--optimizer",
|
48 |
+
"anyprecision",
|
49 |
+
"--adam-beta1",
|
50 |
+
"0.9",
|
51 |
+
"--adam-beta2",
|
52 |
+
"0.99",
|
53 |
+
"--adam-eps",
|
54 |
+
"1e-6",
|
55 |
+
"--save-interval",
|
56 |
+
"100",
|
57 |
+
"--eval-interval",
|
58 |
+
"100",
|
59 |
+
"--eval-iters",
|
60 |
+
"10",
|
61 |
+
"--bf16",
|
62 |
+
"--mixed-precision",
|
63 |
+
"--base-model",
|
64 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
65 |
+
"--hf-transformer-model-dir",
|
66 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
67 |
+
"--save",
|
68 |
+
"/work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
|
69 |
+
"--load",
|
70 |
+
"/work/llm_recipes/models/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
|
71 |
+
"--fsdp-activation-checkpointing",
|
72 |
+
"--sharding-strategy",
|
73 |
+
"SHARD_GRAD_OP",
|
74 |
+
"--checkpoint-type",
|
75 |
+
"LOCAL_STATE_DICT",
|
76 |
+
"--save-sampler-state",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--hf-upload-retry-limit",
|
80 |
+
"2",
|
81 |
+
"--hf-repo-id",
|
82 |
+
"koichi12/yans-baseline-qwen2-0.5B-3.5e-5-ichikara",
|
83 |
+
"--num-workers",
|
84 |
+
"4",
|
85 |
+
"--wandb-entity",
|
86 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
87 |
+
"--wandb-project",
|
88 |
+
"yans_experiment",
|
89 |
+
"--wandb-name",
|
90 |
+
"yans-baseline-qwen2-0.5B-3.5e-5-ichikara_train_2024-08-31-19:23:33"
|
91 |
+
],
|
92 |
+
"state": "running",
|
93 |
+
"program": "/project/examples/finetuning.py",
|
94 |
+
"codePathLocal": "examples/finetuning.py",
|
95 |
+
"codePath": "examples/finetuning.py",
|
96 |
+
"git": {
|
97 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
98 |
+
"commit": "3b2976faebe2228c39adb20194a29b785a37defe"
|
99 |
+
},
|
100 |
+
"email": null,
|
101 |
+
"root": "/project",
|
102 |
+
"host": "gpu-koiwa-00",
|
103 |
+
"username": "koiwa",
|
104 |
+
"executable": "/usr/bin/python",
|
105 |
+
"cpu_count": 18,
|
106 |
+
"cpu_count_logical": 18,
|
107 |
+
"cpu_freq": {
|
108 |
+
"current": 2400.025999999999,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
"cpu_freq_per_core": [
|
113 |
+
{
|
114 |
+
"current": 2400.026,
|
115 |
+
"min": 0.0,
|
116 |
+
"max": 0.0
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"current": 2400.026,
|
120 |
+
"min": 0.0,
|
121 |
+
"max": 0.0
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"current": 2400.026,
|
125 |
+
"min": 0.0,
|
126 |
+
"max": 0.0
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"current": 2400.026,
|
130 |
+
"min": 0.0,
|
131 |
+
"max": 0.0
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"current": 2400.026,
|
135 |
+
"min": 0.0,
|
136 |
+
"max": 0.0
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"current": 2400.026,
|
140 |
+
"min": 0.0,
|
141 |
+
"max": 0.0
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"current": 2400.026,
|
145 |
+
"min": 0.0,
|
146 |
+
"max": 0.0
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"current": 2400.026,
|
150 |
+
"min": 0.0,
|
151 |
+
"max": 0.0
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"current": 2400.026,
|
155 |
+
"min": 0.0,
|
156 |
+
"max": 0.0
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"current": 2400.026,
|
160 |
+
"min": 0.0,
|
161 |
+
"max": 0.0
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"current": 2400.026,
|
165 |
+
"min": 0.0,
|
166 |
+
"max": 0.0
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"current": 2400.026,
|
170 |
+
"min": 0.0,
|
171 |
+
"max": 0.0
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"current": 2400.026,
|
175 |
+
"min": 0.0,
|
176 |
+
"max": 0.0
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"current": 2400.026,
|
180 |
+
"min": 0.0,
|
181 |
+
"max": 0.0
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"current": 2400.026,
|
185 |
+
"min": 0.0,
|
186 |
+
"max": 0.0
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"current": 2400.026,
|
190 |
+
"min": 0.0,
|
191 |
+
"max": 0.0
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"current": 2400.026,
|
195 |
+
"min": 0.0,
|
196 |
+
"max": 0.0
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"current": 2400.026,
|
200 |
+
"min": 0.0,
|
201 |
+
"max": 0.0
|
202 |
+
}
|
203 |
+
],
|
204 |
+
"disk": {
|
205 |
+
"/": {
|
206 |
+
"total": 0.0625,
|
207 |
+
"used": 1.1444091796875e-05
|
208 |
+
}
|
209 |
+
},
|
210 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
211 |
+
"gpu_count": 1,
|
212 |
+
"gpu_devices": [
|
213 |
+
{
|
214 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
215 |
+
"memory_total": 42949672960
|
216 |
+
}
|
217 |
+
],
|
218 |
+
"memory": {
|
219 |
+
"total": 56.48781967163086
|
220 |
+
}
|
221 |
+
}
|
wandb/run-20240831_192346-5vo4p2k7/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 3}}
|