Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- wandb/run-20240804_021032-cd2cg2ui/files/config.yaml +314 -0
- wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt +271 -0
- wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log +185 -0
- wandb/run-20240804_021032-cd2cg2ui/logs/debug.log +28 -0
- wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb +0 -0
- wandb/run-20240804_035140-nyllt780/files/config.yaml +335 -0
- wandb/run-20240804_035140-nyllt780/files/output.log +130 -0
- wandb/run-20240804_035140-nyllt780/files/requirements.txt +271 -0
- wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json +215 -0
- wandb/run-20240804_035140-nyllt780/files/wandb-summary.json +1 -0
- wandb/run-20240804_035140-nyllt780/logs/debug-internal.log +215 -0
- wandb/run-20240804_035140-nyllt780/logs/debug.log +30 -0
- wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb +0 -0
- wandb/run-20240804_211947-niq3ake5/files/config.yaml +335 -0
- wandb/run-20240804_211947-niq3ake5/files/output.log +135 -0
- wandb/run-20240804_211947-niq3ake5/files/requirements.txt +271 -0
- wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json +215 -0
- wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json +1 -0
- wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log +213 -0
- wandb/run-20240804_211947-niq3ake5/logs/debug.log +30 -0
- wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb +0 -0
- wandb/run-20240812_055620-qpw0uqx2/files/config.yaml +314 -0
- wandb/run-20240812_055620-qpw0uqx2/files/output.log +9 -0
- wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt +271 -0
- wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json +215 -0
- wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json +1 -0
- wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log +181 -0
- wandb/run-20240812_055620-qpw0uqx2/logs/debug.log +27 -0
- wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb +0 -0
- wandb/run-20240812_073955-ikoro1zp/files/config.yaml +335 -0
- wandb/run-20240812_073955-ikoro1zp/files/output.log +0 -0
- wandb/run-20240812_073955-ikoro1zp/files/requirements.txt +271 -0
- wandb/run-20240812_073955-ikoro1zp/files/wandb-metadata.json +215 -0
- wandb/run-20240812_073955-ikoro1zp/files/wandb-summary.json +1 -0
- wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log +0 -0
- wandb/run-20240812_073955-ikoro1zp/logs/debug.log +29 -0
- wandb/run-20240823_160642-78xnl14c/files/config.yaml +342 -0
- wandb/run-20240823_160642-78xnl14c/files/output.log +253 -0
- wandb/run-20240823_160642-78xnl14c/files/requirements.txt +375 -0
- wandb/run-20240823_160642-78xnl14c/files/wandb-metadata.json +220 -0
- wandb/run-20240823_160642-78xnl14c/files/wandb-summary.json +1 -0
- wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log +0 -0
- wandb/run-20240823_160642-78xnl14c/logs/debug.log +30 -0
- wandb/run-20240823_160642-78xnl14c/run-78xnl14c.wandb +0 -0
- wandb/run-20240823_162922-z3gs82jm/files/config.yaml +342 -0
- wandb/run-20240823_162922-z3gs82jm/files/output.log +174 -0
- wandb/run-20240823_162922-z3gs82jm/files/requirements.txt +375 -0
- wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json +220 -0
- wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json +1 -0
- wandb/run-20240823_162922-z3gs82jm/logs/debug-internal.log +453 -0
wandb/run-20240804_021032-cd2cg2ui/files/config.yaml
ADDED
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 1024
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample_train_2024-08-04-02:10:14
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 8192
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722705032.417279
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 02:10:32,669 INFO StreamThr :11309 [internal.py:wandb_internal():86] W&B internal server running at pid: 11309, started at: 2024-08-04 02:10:32.417731
|
2 |
+
2024-08-04 02:10:32,670 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 02:10:32,672 INFO WriterThread:11309 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb
|
4 |
+
2024-08-04 02:10:32,673 DEBUG SenderThread:11309 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 02:10:32,883 DEBUG SenderThread:11309 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 02:10:33,348 INFO SenderThread:11309 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_021032-cd2cg2ui/files
|
7 |
+
2024-08-04 02:10:33,348 INFO SenderThread:11309 [sender.py:_start_run_threads():1136] run started: cd2cg2ui with start time 1722705032.417279
|
8 |
+
2024-08-04 02:10:33,353 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 02:10:33,354 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 02:10:33,438 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 02:10:33,444 DEBUG HandlerThread:11309 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 02:10:33,444 DEBUG HandlerThread:11309 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 02:10:33,444 INFO HandlerThread:11309 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 02:10:33,444 INFO SystemMonitor:11309 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 02:10:33,445 INFO HandlerThread:11309 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 02:10:33,445 INFO SystemMonitor:11309 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 02:10:33,445 INFO SystemMonitor:11309 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 02:10:33,447 INFO SystemMonitor:11309 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 02:10:33,447 INFO SystemMonitor:11309 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 02:10:33,448 INFO SystemMonitor:11309 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 02:10:33,458 DEBUG HandlerThread:11309 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 02:10:33,460 DEBUG HandlerThread:11309 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 02:10:33,471 DEBUG HandlerThread:11309 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T17:10:33.458421', 'startedAt': '2024-08-03T17:10:32.395506', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '8192', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-04-02:10:14'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 
'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-04 02:10:33,471 INFO HandlerThread:11309 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 02:10:33,471 INFO HandlerThread:11309 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 02:10:33,473 INFO HandlerThread:11309 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 02:10:33,478 DEBUG SenderThread:11309 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 02:10:33,479 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 02:10:33,488 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 02:10:33,488 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-04 02:10:33,488 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-04 02:10:33,489 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-04 02:10:33,490 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 02:10:33,776 DEBUG SenderThread:11309 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 02:10:34,131 INFO wandb-upload_0:11309 [upload_job.py:push():131] Uploaded file /tmp/tmpcp1trk59wandb/1uhn5dog-wandb-metadata.json
|
38 |
+
2024-08-04 02:10:34,327 DEBUG SenderThread:11309 [sender.py:send():382] send: exit
|
39 |
+
2024-08-04 02:10:34,327 INFO SenderThread:11309 [sender.py:send_exit():589] handling exit code: 1
|
40 |
+
2024-08-04 02:10:34,327 INFO SenderThread:11309 [sender.py:send_exit():591] handling runtime: 0
|
41 |
+
2024-08-04 02:10:34,328 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
42 |
+
2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:send_exit():597] send defer
|
43 |
+
2024-08-04 02:10:34,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
44 |
+
2024-08-04 02:10:34,329 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 0
|
45 |
+
2024-08-04 02:10:34,329 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
46 |
+
2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 0
|
47 |
+
2024-08-04 02:10:34,329 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 1
|
48 |
+
2024-08-04 02:10:34,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
49 |
+
2024-08-04 02:10:34,329 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 1
|
50 |
+
2024-08-04 02:10:34,330 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
51 |
+
2024-08-04 02:10:34,330 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 1
|
52 |
+
2024-08-04 02:10:34,330 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 2
|
53 |
+
2024-08-04 02:10:34,330 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
54 |
+
2024-08-04 02:10:34,330 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 2
|
55 |
+
2024-08-04 02:10:34,330 INFO HandlerThread:11309 [system_monitor.py:finish():203] Stopping system monitor
|
56 |
+
2024-08-04 02:10:34,330 DEBUG SystemMonitor:11309 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
57 |
+
2024-08-04 02:10:34,330 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined cpu monitor
|
58 |
+
2024-08-04 02:10:34,330 DEBUG SystemMonitor:11309 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
59 |
+
2024-08-04 02:10:34,331 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined disk monitor
|
60 |
+
2024-08-04 02:10:34,331 DEBUG SystemMonitor:11309 [system_monitor.py:_start():183] Publishing last batch of metrics
|
61 |
+
2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt
|
62 |
+
2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
|
63 |
+
2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json
|
64 |
+
2024-08-04 02:10:34,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json
|
65 |
+
2024-08-04 02:10:34,365 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined gpu monitor
|
66 |
+
2024-08-04 02:10:34,366 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined memory monitor
|
67 |
+
2024-08-04 02:10:34,366 INFO HandlerThread:11309 [interfaces.py:finish():202] Joined network monitor
|
68 |
+
2024-08-04 02:10:34,366 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
69 |
+
2024-08-04 02:10:34,366 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 2
|
70 |
+
2024-08-04 02:10:34,366 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 3
|
71 |
+
2024-08-04 02:10:34,366 DEBUG SenderThread:11309 [sender.py:send():382] send: stats
|
72 |
+
2024-08-04 02:10:34,366 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
73 |
+
2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 3
|
74 |
+
2024-08-04 02:10:34,367 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
75 |
+
2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 3
|
76 |
+
2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 4
|
77 |
+
2024-08-04 02:10:34,367 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
78 |
+
2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 4
|
79 |
+
2024-08-04 02:10:34,367 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
80 |
+
2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 4
|
81 |
+
2024-08-04 02:10:34,367 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 5
|
82 |
+
2024-08-04 02:10:34,367 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
83 |
+
2024-08-04 02:10:34,367 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 5
|
84 |
+
2024-08-04 02:10:34,368 DEBUG SenderThread:11309 [sender.py:send():382] send: summary
|
85 |
+
2024-08-04 02:10:34,368 INFO SenderThread:11309 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
86 |
+
2024-08-04 02:10:34,369 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
87 |
+
2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 5
|
88 |
+
2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 6
|
89 |
+
2024-08-04 02:10:34,369 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
90 |
+
2024-08-04 02:10:34,369 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 6
|
91 |
+
2024-08-04 02:10:34,369 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
92 |
+
2024-08-04 02:10:34,369 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 6
|
93 |
+
2024-08-04 02:10:34,372 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-04 02:10:34,573 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 7
|
95 |
+
2024-08-04 02:10:34,573 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
96 |
+
2024-08-04 02:10:34,573 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 7
|
97 |
+
2024-08-04 02:10:34,573 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
98 |
+
2024-08-04 02:10:34,573 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 7
|
99 |
+
2024-08-04 02:10:35,327 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
|
100 |
+
2024-08-04 02:10:35,350 INFO Thread-12 :11309 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml
|
101 |
+
2024-08-04 02:10:36,318 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 8
|
102 |
+
2024-08-04 02:10:36,318 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
|
103 |
+
2024-08-04 02:10:36,318 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
104 |
+
2024-08-04 02:10:36,318 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 8
|
105 |
+
2024-08-04 02:10:36,318 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
106 |
+
2024-08-04 02:10:36,318 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 8
|
107 |
+
2024-08-04 02:10:36,319 INFO SenderThread:11309 [job_builder.py:build():296] Attempting to build job artifact
|
108 |
+
2024-08-04 02:10:36,319 INFO SenderThread:11309 [job_builder.py:_get_source_type():426] is repo sourced job
|
109 |
+
2024-08-04 02:10:36,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
|
110 |
+
2024-08-04 02:10:36,333 INFO SenderThread:11309 [job_builder.py:build():402] adding wandb-job metadata file
|
111 |
+
2024-08-04 02:10:36,342 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 9
|
112 |
+
2024-08-04 02:10:36,342 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
|
113 |
+
2024-08-04 02:10:36,342 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
114 |
+
2024-08-04 02:10:36,342 DEBUG SenderThread:11309 [sender.py:send():382] send: artifact
|
115 |
+
2024-08-04 02:10:36,342 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 9
|
116 |
+
2024-08-04 02:10:36,351 INFO Thread-12 :11309 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
|
117 |
+
2024-08-04 02:10:37,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
|
118 |
+
2024-08-04 02:10:37,552 INFO wandb-upload_1:11309 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpteaibpd9
|
119 |
+
2024-08-04 02:10:37,910 INFO wandb-upload_0:11309 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp9hfu5wh3
|
120 |
+
2024-08-04 02:10:39,216 INFO SenderThread:11309 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MTk4ODAyMA==', 'versionIndex': 2}}}
|
121 |
+
2024-08-04 02:10:39,216 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
122 |
+
2024-08-04 02:10:39,216 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 9
|
123 |
+
2024-08-04 02:10:39,216 INFO SenderThread:11309 [dir_watcher.py:finish():358] shutting down directory watcher
|
124 |
+
2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_021032-cd2cg2ui/files
|
125 |
+
2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt requirements.txt
|
126 |
+
2024-08-04 02:10:39,352 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml config.yaml
|
127 |
+
2024-08-04 02:10:39,354 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json wandb-metadata.json
|
128 |
+
2024-08-04 02:10:39,354 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json wandb-summary.json
|
129 |
+
2024-08-04 02:10:39,355 INFO SenderThread:11309 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log output.log
|
130 |
+
2024-08-04 02:10:39,357 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 10
|
131 |
+
2024-08-04 02:10:39,357 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
|
132 |
+
2024-08-04 02:10:39,358 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
133 |
+
2024-08-04 02:10:39,358 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 10
|
134 |
+
2024-08-04 02:10:39,359 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
135 |
+
2024-08-04 02:10:39,359 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 10
|
136 |
+
2024-08-04 02:10:39,359 INFO SenderThread:11309 [file_pusher.py:finish():172] shutting down file pusher
|
137 |
+
2024-08-04 02:10:39,788 INFO wandb-upload_0:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/config.yaml
|
138 |
+
2024-08-04 02:10:39,856 INFO wandb-upload_1:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/requirements.txt
|
139 |
+
2024-08-04 02:10:39,931 INFO wandb-upload_3:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/output.log
|
140 |
+
2024-08-04 02:10:39,937 INFO wandb-upload_2:11309 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json
|
141 |
+
2024-08-04 02:10:40,137 INFO Thread-11 (_thread_body):11309 [sender.py:transition_state():617] send defer: 11
|
142 |
+
2024-08-04 02:10:40,137 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
143 |
+
2024-08-04 02:10:40,137 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 11
|
144 |
+
2024-08-04 02:10:40,138 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
145 |
+
2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 11
|
146 |
+
2024-08-04 02:10:40,138 INFO SenderThread:11309 [file_pusher.py:join():178] waiting for file pusher
|
147 |
+
2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 12
|
148 |
+
2024-08-04 02:10:40,138 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
149 |
+
2024-08-04 02:10:40,138 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 12
|
150 |
+
2024-08-04 02:10:40,138 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
151 |
+
2024-08-04 02:10:40,138 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 12
|
152 |
+
2024-08-04 02:10:40,138 INFO SenderThread:11309 [file_stream.py:finish():595] file stream finish called
|
153 |
+
2024-08-04 02:10:40,324 INFO SenderThread:11309 [file_stream.py:finish():599] file stream finish is done
|
154 |
+
2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 13
|
155 |
+
2024-08-04 02:10:40,324 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
156 |
+
2024-08-04 02:10:40,324 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 13
|
157 |
+
2024-08-04 02:10:40,324 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
158 |
+
2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 13
|
159 |
+
2024-08-04 02:10:40,324 INFO SenderThread:11309 [sender.py:transition_state():617] send defer: 14
|
160 |
+
2024-08-04 02:10:40,325 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: defer
|
161 |
+
2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send():382] send: final
|
162 |
+
2024-08-04 02:10:40,325 INFO HandlerThread:11309 [handler.py:handle_request_defer():172] handle defer: 14
|
163 |
+
2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send():382] send: footer
|
164 |
+
2024-08-04 02:10:40,325 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: defer
|
165 |
+
2024-08-04 02:10:40,325 INFO SenderThread:11309 [sender.py:send_request_defer():613] handle sender defer: 14
|
166 |
+
2024-08-04 02:10:40,325 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
|
167 |
+
2024-08-04 02:10:40,326 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: poll_exit
|
168 |
+
2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
|
169 |
+
2024-08-04 02:10:40,326 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: server_info
|
170 |
+
2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: poll_exit
|
171 |
+
2024-08-04 02:10:40,326 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: server_info
|
172 |
+
2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: get_summary
|
173 |
+
2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: sampled_history
|
174 |
+
2024-08-04 02:10:40,328 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: internal_messages
|
175 |
+
2024-08-04 02:10:40,329 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: job_info
|
176 |
+
2024-08-04 02:10:40,492 DEBUG SenderThread:11309 [sender.py:send_request():409] send_request: job_info
|
177 |
+
2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
178 |
+
2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
179 |
+
2024-08-04 02:10:40,492 INFO MainThread:11309 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
180 |
+
2024-08-04 02:10:40,492 DEBUG HandlerThread:11309 [handler.py:handle_request():146] handle_request: shutdown
|
181 |
+
2024-08-04 02:10:40,492 INFO HandlerThread:11309 [handler.py:finish():869] shutting down handler
|
182 |
+
2024-08-04 02:10:41,329 INFO WriterThread:11309 [datastore.py:close():296] close: /project/wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb
|
183 |
+
2024-08-04 02:10:41,492 INFO SenderThread:11309 [sender.py:finish():1572] shutting down sender
|
184 |
+
2024-08-04 02:10:41,492 INFO SenderThread:11309 [file_pusher.py:finish():172] shutting down file pusher
|
185 |
+
2024-08-04 02:10:41,492 INFO SenderThread:11309 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_021032-cd2cg2ui/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 02:10:32,410 INFO MainThread:11238 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Configure stats pid to 11238
|
3 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021032-cd2cg2ui/logs/debug.log
|
9 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021032-cd2cg2ui/logs/debug-internal.log
|
10 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-04-02:10:14', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 8192, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 02:10:32,411 INFO MainThread:11238 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 02:10:32,416 INFO MainThread:11238 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 02:10:32,416 INFO MainThread:11238 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 02:10:32,422 INFO MainThread:11238 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 02:10:32,879 INFO MainThread:11238 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 02:10:33,353 INFO MainThread:11238 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 02:10:33,431 INFO MainThread:11238 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 02:10:33,431 INFO MainThread:11238 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 02:10:33,487 INFO MainThread:11238 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 02:10:33,488 INFO MainThread:11238 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 02:10:33,489 INFO MainThread:11238 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 02:10:41,493 WARNING MsgRouterThr:11238 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_021032-cd2cg2ui/run-cd2cg2ui.wandb
ADDED
Binary file (7.16 kB). View file
|
|
wandb/run-20240804_035140-nyllt780/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb_version: 1

sharding_strategy:
  desc: null
  value: FULL_SHARD
checkpoint_type:
  desc: null
  value: LOCAL_STATE_DICT
fsdp_activation_checkpointing:
  desc: null
  value: true
fsdp_cpu_offload:
  desc: null
  value: false
low_cpu_fsdp:
  desc: null
  value: false
no_meta_device:
  desc: null
  value: false
data_path:
  desc: null
  value: null
split:
  desc: null
  value: 969, 30, 1
train_data_path:
  desc: null
  value:
  - '4013541'
  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
valid_data_path:
  desc: null
  value:
  - '4013541'
  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
test_data_path:
  desc: null
  value:
  - '4013541'
  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
data_cache_path:
  desc: null
  value: null
vocab_size:
  desc: null
  value: null
vocab_file:
  desc: null
  value: null
merge_file:
  desc: null
  value: null
seq_length:
  desc: null
  value: 512
num_workers:
  desc: null
  value: 2
tokenizer_type:
  desc: null
  value: Llama2Tokenizer
tokenizer_model:
  desc: null
  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
reset_position_ids:
  desc: null
  value: false
reset_attention_mask:
  desc: null
  value: false
eod_mask_loss:
  desc: null
  value: false
retro_return_doc_ids:
  desc: null
  value: false
short_seq_prob:
  desc: null
  value: 0.1
vocab_extra_ids:
  desc: null
  value: 0
seed:
  desc: null
  value: 1234
use_mpi:
  desc: null
  value: false
wandb_entity:
  desc: null
  value: iwakawa-koichi-q5-tohoku-nlp6723
wandb_name:
  desc: null
  value: tiny-llama-sample_train_2024-08-04-03:51:30
wandb_project:
  desc: null
  value: llm_tutorial
quantization:
  desc: null
  value: false
use_freeze_layers:
  desc: null
  value: false
freeze_layers:
  desc: null
  value: null
bf16:
  desc: null
  value: true
fp16:
  desc: null
  value: false
mixed_precision:
  desc: null
  value: true
param_dtype:
  desc: null
  value: null
load:
  desc: null
  value: /work/llm_recipes/models/tiny-llama-sample
save:
  desc: null
  value: /work/llm_recipes/models/tiny-llama-sample
base_model:
  desc: null
  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
use_better_transformer:
  desc: null
  value: false
grad_clip_norm:
  desc: null
  value: 1.0
eval_interval:
  desc: null
  value: 200
save_interval:
  desc: null
  value: 200
eval_iters:
  desc: null
  value: 10
optimizer:
  desc: null
  value: adam
lr:
  desc: null
  value: 2.0e-05
lr_decay_style:
  desc: null
  value: cosine
lr_decay_iters:
  desc: null
  value: 20000
lr_warmup_iters:
  desc: null
  value: 500
min_lr:
  desc: null
  value: 1.0e-06
train_iters:
  desc: null
  value: 20000
train_samples:
  desc: null
  value: null
global_batch_size:
  desc: null
  value: 320
micro_batch_size:
  desc: null
  value: 8
make_vocab_size_divisible_by:
  desc: null
  value: 128
sliding_window_size:
  desc: null
  value: 4096
skip_batch:
  desc: null
  value: null
no_save_optimizer_state:
  desc: null
  value: false
continual_pretraining:
  desc: null
  value: false
instruction_tuning:
  desc: null
  value: false
direct_preference_optimization:
  desc: null
  value: false
attention_dropout:
  desc: null
  value: 0.1
hidden_dropout:
  desc: null
  value: 0.1
weight_decay:
  desc: null
  value: 0.1
adam_beta1:
  desc: null
  value: 0.9
adam_beta2:
  desc: null
  value: 0.95
adam_eps:
  desc: null
  value: 1.0e-06
hf_transformer_model_dir:
  desc: null
  value: null
instruction_train_data_path:
  desc: null
  value: null
instruction_valid_data_path:
  desc: null
  value: null
epoch:
  desc: null
  value: null
instruction_dataset_size:
  desc: null
  value: null
save_sampler_state:
  desc: null
  value: false
label_smoothing:
  desc: null
  value: 0.0
save_n_checkpoints:
  desc: null
  value: 10
hf_repo_id:
  desc: null
  value: koichi12/tiny-llama-sample
create_public_hf_repo:
  desc: null
  value: false
upload_all_checkpoints_to_hf:
  desc: null
  value: false
hf_upload_retry_limit:
  desc: null
  value: 2
exit_duration_in_mins:
  desc: null
  value: null
source_key:
  desc: null
  value: null
target_key:
  desc: null
  value: null
attn_implementation:
  desc: null
  value: flash_attention_2
efficient_instruction_tuning:
  desc: null
  value: false
remove_padding_masking:
  desc: null
  value: false
save_start_iter:
  desc: null
  value: null
rank:
  desc: null
  value: 0
world_size:
  desc: null
  value: 1
padded_vocab_size:
  desc: null
  value: 32000
gradient_accumulation_steps:
  desc: null
  value: 40
_wandb:
  desc: null
  value:
    python_version: 3.10.12
    cli_version: 0.16.3
    framework: huggingface
    huggingface_version: 4.43.3
    is_jupyter_run: false
    is_kaggle_kernel: false
    start_time: 1722711100.510646
    t:
      1:
      - 1
      - 11
      - 49
      - 55
      - 71
      2:
      - 1
      - 11
      - 49
      - 55
      - 71
      3:
      - 13
      - 16
      - 23
      4: 3.10.12
      5: 0.16.3
      6: 4.43.3
      8:
      - 5
      13: linux-x86_64
activation_function:
  desc: null
  value: silu
hidden_size:
  desc: null
  value: 2048
model_type:
  desc: null
  value: llama
max_position_embeddings:
  desc: null
  value: 2048
num_attention_heads:
  desc: null
  value: 32
num_hidden_layers:
  desc: null
  value: 22
model_architecture:
  desc: null
  value: LlamaForCausalLM
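
Note: the batch-size and learning-rate fields in this config are interdependent. The sketch below is illustrative only (it is not the repository's scheduler code); all constants are copied from the YAML above, and the cosine-with-warmup shape is inferred from lr_decay_style: cosine.

    import math

    micro_batch_size = 8
    gradient_accumulation_steps = 40
    world_size = 1
    # 8 sequences per forward pass * 40 accumulation steps * 1 rank = 320
    global_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
    assert global_batch_size == 320  # matches the logged global_batch_size

    lr, min_lr = 2.0e-05, 1.0e-06
    lr_warmup_iters, lr_decay_iters = 500, 20000

    def lr_at(iteration: int) -> float:
        """Linear warmup followed by cosine decay down to min_lr."""
        if iteration < lr_warmup_iters:
            return lr * iteration / lr_warmup_iters
        progress = min(1.0, (iteration - lr_warmup_iters) / (lr_decay_iters - lr_warmup_iters))
        return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    # Rough token budget implied by the config: 320 sequences * 512 tokens per step,
    # for 20000 steps, i.e. about 3.3 billion tokens.
    tokens_per_step = global_batch_size * 512
    total_tokens = tokens_per_step * 20000
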
wandb/run-20240804_035140-nyllt780/files/output.log
ADDED
@@ -0,0 +1,130 @@
Created Hugging Face repository with ID koichi12/tiny-llama-sample.
Clearing GPU cache for all ranks
--> Running with torch torch_distributed debug set to detail
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
  warnings.warn(
Let split = None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
BFloat16 enabled for mixed precision - using bfSixteen policy
--> applying fsdp activation checkpointing...
> datasets target sizes (minimum size):
    train: 6400000
    validation: 323200
    test: 3200
> building train, validation, and test datasets for GPT ...
> finished creating GPT datasets ...
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
model info: FullyShardedDataParallel(
  (_fsdp_wrapped_module): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 2048)
      (layers): ModuleList(
        (0-21): 22 x FullyShardedDataParallel(
          (_fsdp_wrapped_module): CheckpointWrapper(
            (_checkpoint_wrapped_module): LlamaDecoderLayer(
              (self_attn): LlamaFlashAttention2(
                (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (k_proj): Linear(in_features=2048, out_features=256, bias=False)
                (v_proj): Linear(in_features=2048, out_features=256, bias=False)
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
              )
              (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
                (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
                (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): LlamaRMSNorm()
              (post_attention_layernorm): LlamaRMSNorm()
            )
          )
        )
      )
      (norm): LlamaRMSNorm()
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
  )
)
model config: LlamaConfig {
  "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "label_smoothing": 0.0,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.3",
  "use_cache": false,
  "vocab_size": 32000
}
Traceback (most recent call last):
  File "/project/examples/finetuning.py", line 13, in <module>
    main()
  File "/project/src/llama_recipes/finetuning.py", line 281, in main
    train(
  File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
    loss: torch.Tensor = model(**batch).loss
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
    cache_position = torch.arange(
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
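
Note: the Flash Attention 2.0 warnings in this output.log come from loading the model without a torch dtype, and the fix below is the one the transformers warning itself suggests. This is a hedged sketch, not the repository's finetuning code; the model path is the base_model from this run's config, and bf16 matches the run's --bf16 flag.

    import torch
    from transformers import AutoModelForCausalLM

    # Loading with an explicit bf16 dtype avoids the "torch.float32" Flash Attention warnings.
    model = AutoModelForCausalLM.from_pretrained(
        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
        attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
    )
    model.to("cuda")  # Flash Attention also warns if the model is not moved to the GPU

    # For the later "device-side assert triggered" RuntimeError, the log's own advice applies:
    # rerun with the environment variable CUDA_LAUNCH_BLOCKING=1 so the failing kernel is
    # reported synchronously at its call site instead of at a later, unrelated API call.
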
wandb/run-20240804_035140-nyllt780/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
absl-py==2.1.0
accelerate==0.33.0
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
apex==0.1
appdirs==1.4.4
argon2-cffi-bindings==21.2.0
argon2-cffi==23.1.0
asttokens==2.4.1
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.2.0
audioread==3.0.1
beautifulsoup4==4.12.3
bleach==6.1.0
blis==0.7.11
cachetools==5.3.2
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
cloudpickle==3.0.0
cmake==3.28.1
colorama==0.4.6
comm==0.2.1
confection==0.1.4
contourpy==1.2.0
cubinlinker==0.3.0+2.g405ac64
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
cudf==23.12.0
cugraph-dgl==23.12.0
cugraph-service-client==23.12.0
cugraph-service-server==23.12.0
cugraph==23.12.0
cuml==23.12.0
cupy-cuda12x==12.3.0
cycler==0.12.1
cymem==2.0.8
cython==3.0.8
dask-cuda==23.12.0
dask-cudf==23.12.0
dask==2023.11.0
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
distributed==2023.11.0
dm-tree==0.1.8
docker-pycreds==0.4.0
einops==0.7.0
exceptiongroup==1.2.0
execnet==2.0.2
executing==2.0.1
expecttest==0.1.3
fastjsonschema==2.19.1
fastrlock==0.8.2
filelock==3.13.1
flash-attn==2.4.2
fonttools==4.48.1
frozenlist==1.4.1
fsspec==2023.12.2
gast==0.5.4
gitdb==4.0.11
gitpython==3.1.43
google-auth-oauthlib==0.4.6
google-auth==2.27.0
graphsurgeon==0.4.6
grpcio==1.60.1
huggingface-hub==0.24.5
hypothesis==5.35.1
idna==3.6
importlib-metadata==7.0.1
iniconfig==2.0.0
intel-openmp==2021.4.0
ipadic==1.0.0
ipykernel==6.29.2
ipython-genutils==0.2.0
ipython==8.21.0
jedi==0.19.1
jinja2==3.1.3
joblib==1.3.2
json5==0.9.14
jsonnet==0.19.1
jsonschema-specifications==2023.12.1
jsonschema==4.21.1
jupyter-client==8.6.0
jupyter-core==5.7.1
jupyter-tensorboard==0.2.0
jupyterlab-pygments==0.3.0
jupyterlab-server==1.2.0
jupyterlab==2.3.2
jupytext==1.16.1
kiwisolver==1.4.5
langcodes==3.3.0
lazy-loader==0.3
librosa==0.10.1
llvmlite==0.40.1
locket==1.0.0
logzero==1.7.0
lxml==5.2.2
markdown-it-py==3.0.0
markdown==3.5.2
markupsafe==2.1.4
matplotlib-inline==0.1.6
matplotlib==3.8.2
mdit-py-plugins==0.4.0
mdurl==0.1.2
mecab-python3==1.0.6
mistune==3.0.2
mkl-devel==2021.1.1
mkl-include==2021.1.1
mkl==2021.1.1
mock==5.1.0
more-itertools==9.1.0
mpmath==1.3.0
msgpack==1.0.7
multidict==6.0.4
murmurhash==1.0.10
nbclient==0.9.0
nbconvert==7.16.0
nbformat==5.9.2
nest-asyncio==1.6.0
networkx==2.6.3
ninja==1.11.1.1
nltk==3.8.1
notebook==6.4.10
numba==0.57.1+1.g1ff679645
numpy==1.24.4
nvfuser==0.1.4a0+d0bb811
nvidia-dali-cuda120==1.34.0
nvidia-pyindex==1.0.9
nvtx==0.2.5
oauthlib==3.2.2
onnx==1.15.0rc2
opencv==4.7.0
optree==0.10.0
packaging==23.2
pandas==1.5.3
pandocfilters==1.5.1
parso==0.8.3
partd==1.4.1
peft==0.11.1
pexpect==4.9.0
pillow==10.2.0
pip==24.0
platformdirs==4.2.0
pluggy==1.4.0
ply==3.11
polygraphy==0.49.4
pooch==1.8.0
portalocker==2.10.1
preshed==3.0.9
prettytable==3.9.0
prometheus-client==0.19.0
prompt-toolkit==3.0.43
protobuf==4.24.4
psutil==5.9.4
ptxcompiler==0.8.1+2.g0d406d6
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==14.0.1.dev0+gba5374836.d20240125
pyasn1-modules==0.3.0
pyasn1==0.5.1
pybind11-global==2.11.1
pybind11==2.11.1
pycocotools==2.0+nv0.8.0
pycparser==2.21
pydantic-core==2.16.2
pydantic==2.6.1
pygments==2.17.2
pylibcugraph==23.12.0
pylibcugraphops==23.12.0
pylibraft==23.12.0
pynvml==11.4.1
pyparsing==3.1.1
pytest-flakefinder==1.1.0
pytest-rerunfailures==13.0
pytest-shard==0.1.2
pytest-xdist==3.5.0
pytest==8.0.0
python-dateutil==2.8.2
python-dotenv==1.0.0
python-hostlist==1.23.0
pytorch-quantization==2.1.2
pytz==2023.3.post1
pyyaml==6.0.1
pyzmq==25.1.2
raft-dask==23.12.0
rapids-dask-dependency==23.12.1
referencing==0.33.0
regex==2023.12.25
requests-oauthlib==1.3.1
requests==2.31.0
rich==13.7.0
rmm==23.12.0
rpds-py==0.17.1
rsa==4.9
sacrebleu==2.4.0
safetensors==0.4.3
scikit-learn==1.2.0
scipy==1.12.0
send2trash==1.8.2
sentencepiece==0.1.99
sentry-sdk==2.12.0
setproctitle==1.3.3
setuptools==68.2.2
six==1.16.0
smart-open==6.4.0
smmap==5.0.1
sortedcontainers==2.4.0
soundfile==0.12.1
soupsieve==2.5
soxr==0.3.7
spacy-legacy==3.0.12
spacy-loggers==1.0.5
spacy==3.7.2
sphinx-glpi-theme==0.6
srsly==2.4.8
stack-data==0.6.3
sympy==1.12
tabulate==0.9.0
tbb==2021.11.0
tblib==3.0.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorboard==2.9.0
tensorrt==8.6.3
terminado==0.18.0
termplotlib==0.3.9
thinc==8.2.3
threadpoolctl==3.2.0
thriftpy2==0.4.17
tinycss2==1.2.1
tokenizers==0.19.1
toml==0.10.2
tomli==2.0.1
toolz==0.12.1
torch-tensorrt==2.3.0a0
torch==2.3.0a0+ebedce2
torchdata==0.7.1a0
torchtext==0.17.0a0
torchvision==0.18.0a0
tornado==6.4
tqdm==4.66.1
traitlets==5.9.0
transformer-engine==1.3.0+5b90b7f
transformers==4.43.3
treelite-runtime==3.9.1
treelite==3.9.1
triton==2.2.0+e28a256
typer==0.9.0
types-dataclasses==0.6.6
typing-extensions==4.9.0
ucx-py==0.35.0
uff==0.6.9
ujson==5.8.0
urllib3==1.26.18
wandb==0.16.3
wasabi==1.1.2
wcwidth==0.2.13
weasel==0.3.4
webencodings==0.5.1
werkzeug==3.0.1
wheel==0.42.0
xdoctest==1.0.2
xgboost==1.7.6
yarl==1.9.4
zict==3.0.0
zipp==3.17.0
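
Note: requirements.txt above pins the full environment for this run. A small sketch for spot-checking a few of the key pins against a live environment (standard-library only; it assumes the distributions are installed under exactly these names):

    import importlib.metadata as metadata

    for dist in ("torch", "transformers", "flash-attn", "wandb", "accelerate"):
        # Prints e.g. "transformers 4.43.3" if the environment matches the pin above.
        print(dist, metadata.version(dist))
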
wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
{
  "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
  "python": "3.10.12",
  "heartbeatAt": "2024-08-03T18:51:41.236802",
  "startedAt": "2024-08-03T18:51:40.498160",
  "docker": null,
  "cuda": null,
  "args": [
    "--seq-length",
    "512",
    "--sliding-window-size",
    "4096",
    "--micro-batch-size",
    "8",
    "--global-batch-size",
    "320",
    "--train-iters",
    "20000",
    "--tokenizer-type",
    "Llama2Tokenizer",
    "--tokenizer-model",
    "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
    "--train-data-path",
    "4013541",
    "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
    "--valid-data-path",
    "4013541",
    "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
    "--test-data-path",
    "4013541",
    "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
    "--lr",
    "2e-5",
    "--min-lr",
    "1e-6",
    "--lr-decay-style",
    "cosine",
    "--lr-warmup-iters",
    "500",
    "--lr-decay-iters",
    "20000",
    "--weight-decay",
    "0.1",
    "--grad-clip-norm",
    "1.0",
    "--optimizer",
    "adam",
    "--adam-beta1",
    "0.9",
    "--adam-beta2",
    "0.95",
    "--adam-eps",
    "1e-6",
    "--save-interval",
    "200",
    "--eval-interval",
    "200",
    "--eval-iters",
    "10",
    "--bf16",
    "--mixed-precision",
    "--base-model",
    "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
    "--save",
    "/work/llm_recipes/models/tiny-llama-sample",
    "--load",
    "/work/llm_recipes/models/tiny-llama-sample",
    "--fsdp-activation-checkpointing",
    "--sharding-strategy",
    "FULL_SHARD",
    "--checkpoint-type",
    "LOCAL_STATE_DICT",
    "--save-n-checkpoints",
    "10",
    "--hf-upload-retry-limit",
    "2",
    "--hf-repo-id",
    "koichi12/tiny-llama-sample",
    "--wandb-entity",
    "iwakawa-koichi-q5-tohoku-nlp6723",
    "--wandb-project",
    "llm_tutorial",
    "--wandb-name",
    "tiny-llama-sample_train_2024-08-04-03:51:30"
  ],
  "state": "running",
  "program": "/project/examples/finetuning.py",
  "codePathLocal": "examples/finetuning.py",
  "codePath": "examples/finetuning.py",
  "git": {
    "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
    "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
  },
  "email": null,
  "root": "/project",
  "host": "gpu-koiwa-00",
  "username": "koiwa",
  "executable": "/usr/bin/python",
  "cpu_count": 18,
  "cpu_count_logical": 18,
  "cpu_freq": {
    "current": 2400.034,
    "min": 0.0,
    "max": 0.0
  },
  "cpu_freq_per_core": [
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    },
    {
      "current": 2400.034,
      "min": 0.0,
      "max": 0.0
    }
  ],
  "disk": {
    "/": {
      "total": 0.0625,
      "used": 1.1444091796875e-05
    }
  },
  "gpu": "NVIDIA A100-SXM4-40GB",
  "gpu_count": 1,
  "gpu_devices": [
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memory_total": 42949672960
    }
  ],
  "memory": {
    "total": 56.48782730102539
  }
}
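
Note: wandb-metadata.json records the exact argv of this run in its "args" array. A sketch (standard-library only) of turning that array back into a copy-pasteable command; the args list below is a deliberately truncated illustration, and the full list is the one in the JSON above:

    import shlex

    program = "/project/examples/finetuning.py"
    args = [
        "--seq-length", "512",
        "--global-batch-size", "320",
        "--base-model", "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
        # ... remaining flags exactly as listed in the "args" array above ...
    ]
    print("python " + shlex.join([program, *args]))
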
wandb/run-20240804_035140-nyllt780/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"_wandb": {"runtime": 63}}
wandb/run-20240804_035140-nyllt780/logs/debug-internal.log
ADDED
@@ -0,0 +1,215 @@
1 |
+
2024-08-04 03:51:40,525 INFO StreamThr :12425 [internal.py:wandb_internal():86] W&B internal server running at pid: 12425, started at: 2024-08-04 03:51:40.511089
|
2 |
+
2024-08-04 03:51:40,526 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 03:51:40,527 INFO WriterThread:12425 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb
|
4 |
+
2024-08-04 03:51:40,541 DEBUG SenderThread:12425 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 03:51:40,658 DEBUG SenderThread:12425 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 03:51:41,127 INFO SenderThread:12425 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_035140-nyllt780/files
|
7 |
+
2024-08-04 03:51:41,127 INFO SenderThread:12425 [sender.py:_start_run_threads():1136] run started: nyllt780 with start time 1722711100.510646
|
8 |
+
2024-08-04 03:51:41,132 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 03:51:41,133 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 03:51:41,218 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 03:51:41,224 DEBUG HandlerThread:12425 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 03:51:41,224 DEBUG HandlerThread:12425 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 03:51:41,224 INFO HandlerThread:12425 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 03:51:41,224 INFO SystemMonitor:12425 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 03:51:41,224 INFO HandlerThread:12425 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 03:51:41,225 INFO SystemMonitor:12425 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 03:51:41,225 INFO SystemMonitor:12425 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 03:51:41,226 INFO SystemMonitor:12425 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 03:51:41,227 INFO SystemMonitor:12425 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 03:51:41,228 INFO SystemMonitor:12425 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 03:51:41,236 DEBUG HandlerThread:12425 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 03:51:41,238 DEBUG HandlerThread:12425 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 03:51:41,249 DEBUG HandlerThread:12425 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 03:51:41,249 DEBUG HandlerThread:12425 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 03:51:41,250 DEBUG HandlerThread:12425 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T18:51:41.236802', 'startedAt': '2024-08-03T18:51:40.498160', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-03:51:30'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 
'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-04 03:51:41,250 INFO HandlerThread:12425 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 03:51:41,250 INFO HandlerThread:12425 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 03:51:41,302 INFO HandlerThread:12425 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 03:51:41,308 DEBUG SenderThread:12425 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 03:51:41,308 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-04 03:51:41,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-04 03:51:41,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-04 03:51:41,335 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 03:51:41,618 DEBUG SenderThread:12425 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 03:51:41,985 INFO wandb-upload_0:12425 [upload_job.py:push():131] Uploaded file /tmp/tmpxkt1klm7wandb/bxmu94ae-wandb-metadata.json
|
38 |
+
2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/output.log
|
39 |
+
2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt
|
40 |
+
2024-08-04 03:51:42,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json
|
41 |
+
2024-08-04 03:51:44,129 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
|
42 |
+
2024-08-04 03:51:45,608 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-04 03:51:48,132 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
|
44 |
+
2024-08-04 03:51:50,610 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-04 03:51:55,611 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-04 03:51:56,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
|
47 |
+
2024-08-04 03:51:56,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
|
48 |
+
2024-08-04 03:51:56,317 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
|
49 |
+
2024-08-04 03:52:01,592 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
50 |
+
2024-08-04 03:52:06,593 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-04 03:52:11,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
|
52 |
+
2024-08-04 03:52:11,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
|
53 |
+
2024-08-04 03:52:11,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
|
54 |
+
2024-08-04 03:52:12,552 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-04 03:52:13,160 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
|
56 |
+
2024-08-04 03:52:17,755 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
57 |
+
2024-08-04 03:52:22,755 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-04 03:52:26,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
|
59 |
+
2024-08-04 03:52:26,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
|
60 |
+
2024-08-04 03:52:26,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
|
61 |
+
2024-08-04 03:52:28,589 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-04 03:52:33,590 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-04 03:52:38,591 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-04 03:52:41,228 DEBUG SystemMonitor:12425 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
65 |
+
2024-08-04 03:52:41,230 DEBUG SenderThread:12425 [sender.py:send():382] send: stats
|
66 |
+
2024-08-04 03:52:41,316 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: stop_status
|
67 |
+
2024-08-04 03:52:41,317 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: stop_status
|
68 |
+
2024-08-04 03:52:41,360 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
|
69 |
+
2024-08-04 03:52:43,008 DEBUG SenderThread:12425 [sender.py:send():382] send: config
|
70 |
+
2024-08-04 03:52:43,008 DEBUG SenderThread:12425 [sender.py:send():382] send: config
|
71 |
+
2024-08-04 03:52:44,011 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
72 |
+
2024-08-04 03:52:44,176 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
|
73 |
+
2024-08-04 03:52:44,726 DEBUG SenderThread:12425 [sender.py:send():382] send: exit
|
74 |
+
2024-08-04 03:52:44,726 INFO SenderThread:12425 [sender.py:send_exit():589] handling exit code: 1
|
75 |
+
2024-08-04 03:52:44,726 INFO SenderThread:12425 [sender.py:send_exit():591] handling runtime: 63
|
76 |
+
2024-08-04 03:52:44,741 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
77 |
+
2024-08-04 03:52:44,741 INFO SenderThread:12425 [sender.py:send_exit():597] send defer
|
78 |
+
2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
79 |
+
2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 0
|
80 |
+
2024-08-04 03:52:44,742 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
81 |
+
2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 0
|
82 |
+
2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 1
|
83 |
+
2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
84 |
+
2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 1
|
85 |
+
2024-08-04 03:52:44,742 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
86 |
+
2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 1
|
87 |
+
2024-08-04 03:52:44,742 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 2
|
88 |
+
2024-08-04 03:52:44,742 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
89 |
+
2024-08-04 03:52:44,742 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 2
|
90 |
+
2024-08-04 03:52:44,742 INFO HandlerThread:12425 [system_monitor.py:finish():203] Stopping system monitor
|
91 |
+
2024-08-04 03:52:44,743 DEBUG SystemMonitor:12425 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
92 |
+
2024-08-04 03:52:44,743 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined cpu monitor
|
93 |
+
2024-08-04 03:52:44,743 DEBUG SystemMonitor:12425 [system_monitor.py:_start():183] Publishing last batch of metrics
|
94 |
+
2024-08-04 03:52:44,743 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined disk monitor
|
95 |
+
2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined gpu monitor
|
96 |
+
2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined memory monitor
|
97 |
+
2024-08-04 03:52:44,777 INFO HandlerThread:12425 [interfaces.py:finish():202] Joined network monitor
|
98 |
+
2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
99 |
+
2024-08-04 03:52:44,778 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 2
|
100 |
+
2024-08-04 03:52:44,778 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 3
|
101 |
+
2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send():382] send: stats
|
102 |
+
2024-08-04 03:52:44,778 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
103 |
+
2024-08-04 03:52:44,778 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 3
|
104 |
+
2024-08-04 03:52:44,778 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
105 |
+
2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 3
|
106 |
+
2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 4
|
107 |
+
2024-08-04 03:52:44,779 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
108 |
+
2024-08-04 03:52:44,779 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 4
|
109 |
+
2024-08-04 03:52:44,779 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
110 |
+
2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 4
|
111 |
+
2024-08-04 03:52:44,779 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 5
|
112 |
+
2024-08-04 03:52:44,779 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
113 |
+
2024-08-04 03:52:44,779 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 5
|
114 |
+
2024-08-04 03:52:44,779 DEBUG SenderThread:12425 [sender.py:send():382] send: summary
|
115 |
+
2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
116 |
+
2024-08-04 03:52:44,780 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
117 |
+
2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 5
|
118 |
+
2024-08-04 03:52:44,780 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 6
|
119 |
+
2024-08-04 03:52:44,780 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
120 |
+
2024-08-04 03:52:44,781 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 6
|
121 |
+
2024-08-04 03:52:44,781 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
122 |
+
2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 6
|
123 |
+
2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 7
|
124 |
+
2024-08-04 03:52:44,781 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: status_report
|
125 |
+
2024-08-04 03:52:44,781 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
126 |
+
2024-08-04 03:52:44,781 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 7
|
127 |
+
2024-08-04 03:52:44,781 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
128 |
+
2024-08-04 03:52:44,781 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 7
|
129 |
+
2024-08-04 03:52:45,177 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
|
130 |
+
2024-08-04 03:52:45,177 INFO Thread-12 :12425 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json
|
131 |
+
2024-08-04 03:52:45,726 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
|
132 |
+
2024-08-04 03:52:46,178 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
|
133 |
+
2024-08-04 03:52:47,600 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 8
|
134 |
+
2024-08-04 03:52:47,600 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
|
135 |
+
2024-08-04 03:52:47,600 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
|
136 |
+
2024-08-04 03:52:47,601 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 8
|
137 |
+
2024-08-04 03:52:47,601 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
|
138 |
+
2024-08-04 03:52:47,601 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 8
|
139 |
+
2024-08-04 03:52:47,601 INFO SenderThread:12425 [job_builder.py:build():296] Attempting to build job artifact
|
140 |
+
2024-08-04 03:52:47,602 INFO SenderThread:12425 [job_builder.py:_get_source_type():426] is repo sourced job
2024-08-04 03:52:47,616 INFO SenderThread:12425 [job_builder.py:build():402] adding wandb-job metadata file
2024-08-04 03:52:47,688 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 9
2024-08-04 03:52:47,689 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:47,689 DEBUG SenderThread:12425 [sender.py:send():382] send: artifact
2024-08-04 03:52:47,689 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 9
2024-08-04 03:52:47,727 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-04 03:52:48,179 INFO Thread-12 :12425 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035140-nyllt780/files/output.log
2024-08-04 03:52:48,575 INFO SenderThread:12425 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
2024-08-04 03:52:48,575 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:48,575 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 9
2024-08-04 03:52:48,575 INFO SenderThread:12425 [dir_watcher.py:finish():358] shutting down directory watcher
2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_035140-nyllt780/files
2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt requirements.txt
2024-08-04 03:52:49,180 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/config.yaml config.yaml
2024-08-04 03:52:49,181 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/wandb-metadata.json wandb-metadata.json
2024-08-04 03:52:49,182 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json wandb-summary.json
2024-08-04 03:52:49,183 INFO SenderThread:12425 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035140-nyllt780/files/output.log output.log
2024-08-04 03:52:49,185 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 10
2024-08-04 03:52:49,185 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
2024-08-04 03:52:49,185 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:49,187 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 10
2024-08-04 03:52:49,187 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:49,187 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 10
2024-08-04 03:52:49,187 INFO SenderThread:12425 [file_pusher.py:finish():172] shutting down file pusher
2024-08-04 03:52:49,580 INFO wandb-upload_0:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/requirements.txt
2024-08-04 03:52:49,719 INFO wandb-upload_1:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/config.yaml
2024-08-04 03:52:49,727 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-04 03:52:49,727 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
2024-08-04 03:52:49,752 INFO wandb-upload_2:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/wandb-summary.json
2024-08-04 03:52:49,778 INFO wandb-upload_3:12425 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035140-nyllt780/files/output.log
2024-08-04 03:52:49,978 INFO Thread-11 (_thread_body):12425 [sender.py:transition_state():617] send defer: 11
2024-08-04 03:52:49,978 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:49,979 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 11
2024-08-04 03:52:49,979 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 11
2024-08-04 03:52:49,979 INFO SenderThread:12425 [file_pusher.py:join():178] waiting for file pusher
2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 12
2024-08-04 03:52:49,979 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:49,979 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 12
2024-08-04 03:52:49,979 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:49,979 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 12
2024-08-04 03:52:49,979 INFO SenderThread:12425 [file_stream.py:finish():595] file stream finish called
2024-08-04 03:52:50,544 INFO SenderThread:12425 [file_stream.py:finish():599] file stream finish is done
2024-08-04 03:52:50,544 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 13
2024-08-04 03:52:50,545 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:50,545 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 13
2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:50,545 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 13
2024-08-04 03:52:50,545 INFO SenderThread:12425 [sender.py:transition_state():617] send defer: 14
2024-08-04 03:52:50,545 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: defer
2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send():382] send: final
2024-08-04 03:52:50,545 INFO HandlerThread:12425 [handler.py:handle_request_defer():172] handle defer: 14
2024-08-04 03:52:50,545 DEBUG SenderThread:12425 [sender.py:send():382] send: footer
2024-08-04 03:52:50,546 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: defer
2024-08-04 03:52:50,546 INFO SenderThread:12425 [sender.py:send_request_defer():613] handle sender defer: 14
2024-08-04 03:52:50,546 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-04 03:52:50,546 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
2024-08-04 03:52:50,546 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: server_info
2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: get_summary
2024-08-04 03:52:50,547 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: poll_exit
2024-08-04 03:52:50,547 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: sampled_history
2024-08-04 03:52:50,547 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: server_info
2024-08-04 03:52:50,548 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-04 03:52:50,549 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: job_info
2024-08-04 03:52:50,716 DEBUG SenderThread:12425 [sender.py:send_request():409] send_request: job_info
2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_history_summary_info():3866] rendering history
2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
2024-08-04 03:52:50,717 INFO MainThread:12425 [wandb_run.py:_footer_sync_info():3825] logging synced files
2024-08-04 03:52:50,717 DEBUG HandlerThread:12425 [handler.py:handle_request():146] handle_request: shutdown
2024-08-04 03:52:50,717 INFO HandlerThread:12425 [handler.py:finish():869] shutting down handler
2024-08-04 03:52:51,549 INFO WriterThread:12425 [datastore.py:close():296] close: /project/wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb
2024-08-04 03:52:51,717 INFO SenderThread:12425 [sender.py:finish():1572] shutting down sender
2024-08-04 03:52:51,717 INFO SenderThread:12425 [file_pusher.py:finish():172] shutting down file pusher
2024-08-04 03:52:51,717 INFO SenderThread:12425 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_035140-nyllt780/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
2024-08-04 03:51:40,503 INFO MainThread:12354 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Configure stats pid to 12354
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_035140-nyllt780/logs/debug.log
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_035140-nyllt780/logs/debug-internal.log
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():566] calling init triggers
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-03:51:30', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():616] starting backend
2024-08-04 03:51:40,504 INFO MainThread:12354 [wandb_init.py:init():620] setting up manager
2024-08-04 03:51:40,509 INFO MainThread:12354 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-08-04 03:51:40,510 INFO MainThread:12354 [wandb_init.py:init():628] backend started and connected
2024-08-04 03:51:40,515 INFO MainThread:12354 [wandb_init.py:init():720] updated telemetry
2024-08-04 03:51:40,654 INFO MainThread:12354 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
2024-08-04 03:51:41,132 INFO MainThread:12354 [wandb_run.py:_on_init():2262] communicating current version
2024-08-04 03:51:41,211 INFO MainThread:12354 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"

2024-08-04 03:51:41,211 INFO MainThread:12354 [wandb_init.py:init():804] starting run threads in backend
2024-08-04 03:51:41,316 INFO MainThread:12354 [wandb_run.py:_console_start():2241] atexit reg
2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2096] redirect: wrap_raw
2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2161] Wrapping output streams.
2024-08-04 03:51:41,317 INFO MainThread:12354 [wandb_run.py:_redirect():2186] Redirects installed.
2024-08-04 03:51:41,318 INFO MainThread:12354 [wandb_init.py:init():847] run started, returning control to user process
2024-08-04 03:52:43,007 INFO MainThread:12354 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
2024-08-04 03:52:43,008 INFO MainThread:12354 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
2024-08-04 03:52:51,718 WARNING MsgRouterThr:12354 [router.py:message_loop():77] message_loop has been closed
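The debug.log above records the wandb.init() sequence for this run: settings are loaded from the user and project settings files, the run config is attached, and the backend threads are started. Purely as a hedged sketch (not the repository's finetuning.py), a run with the same entity, project, name, and a subset of the logged config would be initialized roughly like this; the surrounding training loop is assumed, and the config values are copied from the log above:

```python
# Minimal sketch (assumption: standalone script, not the repository's code).
import wandb

config = {
    "seq_length": 512,
    "global_batch_size": 320,
    "micro_batch_size": 8,
    "lr": 2e-05,
    "train_iters": 20000,  # values taken from the logged config above
}

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="tiny-llama-sample_train_2024-08-04-03:51:30",
    config=config,
)

# ... training loop would call run.log({...}) here ...

run.finish()  # triggers the defer/file-pusher shutdown sequence seen in debug-internal.log
```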
wandb/run-20240804_035140-nyllt780/run-nyllt780.wandb
ADDED
Binary file (22.5 kB)
wandb/run-20240804_211947-niq3ake5/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-llama_train_2024-08-04-21:19:16
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-llama
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-llama
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 2000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 2000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-llama
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722773987.17106
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 2048
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: llama
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 2048
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 32
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 22
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: LlamaForCausalLM
|
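The config above requests `attn_implementation: flash_attention_2` together with `bf16`/`mixed_precision`, while the base checkpoint stores float32 weights; the Flash Attention dtype warnings in the output.log that follows come from exactly this combination. As an illustrative sketch only (the actual loading code lives in the repository's finetuning script and is not reproduced here), loading the checkpoint with an explicit bfloat16 dtype via the Hugging Face transformers API would look roughly like this:

```python
# Hedged sketch, not the repository's loader: pass torch_dtype explicitly so
# Flash Attention 2 does not see float32 weights at load time.
import torch
from transformers import AutoModelForCausalLM

base_model = "/share/pretrained_lm/meta-llama/TinyLlama_v1.1"  # path from the config above

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,               # matches bf16 / mixed_precision in the config
    attn_implementation="flash_attention_2",  # matches attn_implementation in the config
)
```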
wandb/run-20240804_211947-niq3ake5/files/output.log
ADDED
@@ -0,0 +1,135 @@
Created Hugging Face repository with ID koichi12/tiny-llama.
Clearing GPU cache for all ranks
--> Running with torch torch_distributed debug set to detail
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
warnings.warn(
BFloat16 enabled for mixed precision - using bfSixteen policy
--> applying fsdp activation checkpointing...
> datasets target sizes (minimum size):
train: 640000
validation: 35200
test: 3200
> building train, validation, and test datasets for GPT ...
> finished creating GPT datasets ...
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
model info: FullyShardedDataParallel(
(_fsdp_wrapped_module): LlamaForCausalLM(
(model): LlamaModel(
(embed_tokens): Embedding(32000, 2048)
(layers): ModuleList(
(0-21): 22 x FullyShardedDataParallel(
(_fsdp_wrapped_module): CheckpointWrapper(
(_checkpoint_wrapped_module): LlamaDecoderLayer(
(self_attn): LlamaFlashAttention2(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
(act_fn): SiLU()
)
(input_layernorm): LlamaRMSNorm()
(post_attention_layernorm): LlamaRMSNorm()
)
)
)
)
(norm): LlamaRMSNorm()
(rotary_emb): LlamaRotaryEmbedding()
)
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
)
)
model config: LlamaConfig {
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 5632,
"label_smoothing": 0.0,
"max_position_embeddings": 2048,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 22,
"num_key_value_heads": 4,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"transformers_version": "4.43.3",
"use_cache": false,
"vocab_size": 32000
}
Let split = None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
Building a BlendedDataset for a single MegatronDataset
Unable to save the indexes because path_to_cache is None
Traceback (most recent call last):
File "/project/examples/finetuning.py", line 13, in <module>
main()
File "/project/src/llama_recipes/finetuning.py", line 281, in main
train(
File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
batch = next(train_dataloader)
File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
for x in iter:
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
data = self._next_data()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
data.reraise()
File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
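The run above fails in the DataLoader because the default collate function tries to torch.stack variable-length token sequences (513 vs. 543 tokens). One standard workaround, sketched below under the assumption that each dataset item is a dict of 1-D token-id tensors, is a custom collate_fn that pads every sequence in the batch to the longest one; the key names and pad id here are illustrative and not taken from this repository.

```python
# Hedged sketch of a padding collate_fn for variable-length samples.
# Assumption: each item is a dict like {"input_ids": LongTensor[seq_len], ...};
# key names and PAD_ID are illustrative, not from this repository.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

PAD_ID = 0  # assumption: replace with the tokenizer's actual pad token id

def pad_collate(batch):
    out = {}
    for key in batch[0].keys():
        seqs = [item[key] for item in batch]
        # pad_sequence pads every sequence in the batch to the longest one
        out[key] = pad_sequence(seqs, batch_first=True, padding_value=PAD_ID)
    return out

# Usage (illustrative): DataLoader(dataset, batch_size=8, num_workers=2, collate_fn=pad_collate)
```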
wandb/run-20240804_211947-niq3ake5/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-04T12:19:47.940599",
|
5 |
+
"startedAt": "2024-08-04T12:19:47.157671",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"2000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"2000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-llama",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-llama",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-llama",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-llama_train_2024-08-04-21:19:16"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.044999999999,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.045,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.045,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.045,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.045,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.045,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.045,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.045,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.045,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.045,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.045,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.045,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.045,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.045,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.045,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.045,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.045,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.045,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.045,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
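wandb-metadata.json captures the interpreter, git commit, host, hardware, and the full argument vector of the launch. A small generic sketch (not part of the repository) that rebuilds the original command line from such a file:

```python
# Generic sketch: reconstruct the launch command recorded in wandb-metadata.json.
import json
import shlex

with open("wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json") as f:
    meta = json.load(f)

cmd = ["python", meta["program"], *meta["args"]]
print(shlex.join(cmd))        # e.g. python /project/examples/finetuning.py --seq-length 512 ...
print(meta["git"]["commit"])  # commit the run was launched from
```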
wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"_wandb": {"runtime": 67}}
wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log
ADDED
@@ -0,0 +1,213 @@
2024-08-04 21:19:47,171 INFO StreamThr :10096 [internal.py:wandb_internal():86] W&B internal server running at pid: 10096, started at: 2024-08-04 21:19:47.170590
|
2 |
+
2024-08-04 21:19:47,173 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 21:19:47,176 INFO WriterThread:10096 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb
|
4 |
+
2024-08-04 21:19:47,177 DEBUG SenderThread:10096 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 21:19:47,316 DEBUG SenderThread:10096 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 21:19:47,822 INFO SenderThread:10096 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_211947-niq3ake5/files
|
7 |
+
2024-08-04 21:19:47,822 INFO SenderThread:10096 [sender.py:_start_run_threads():1136] run started: niq3ake5 with start time 1722773987.17106
|
8 |
+
2024-08-04 21:19:47,827 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 21:19:47,827 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 21:19:47,917 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 21:19:47,923 DEBUG HandlerThread:10096 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 21:19:47,923 DEBUG HandlerThread:10096 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 21:19:47,923 INFO HandlerThread:10096 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 21:19:47,923 INFO SystemMonitor:10096 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 21:19:47,924 INFO HandlerThread:10096 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 21:19:47,924 INFO SystemMonitor:10096 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 21:19:47,924 INFO SystemMonitor:10096 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 21:19:47,925 INFO SystemMonitor:10096 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 21:19:47,926 INFO SystemMonitor:10096 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 21:19:47,927 INFO SystemMonitor:10096 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 21:19:47,940 DEBUG HandlerThread:10096 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 21:19:47,942 DEBUG HandlerThread:10096 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 21:19:47,954 DEBUG HandlerThread:10096 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T12:19:47.940599', 'startedAt': '2024-08-04T12:19:47.157671', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-21:19:16'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.044999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 
56.48782730102539}}
|
26 |
+
2024-08-04 21:19:47,954 INFO HandlerThread:10096 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 21:19:47,954 INFO HandlerThread:10096 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 21:19:47,955 INFO HandlerThread:10096 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 21:19:47,982 DEBUG SenderThread:10096 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 21:19:47,982 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 21:19:47,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 21:19:47,992 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: python_packages
|
33 |
+
2024-08-04 21:19:47,992 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
|
34 |
+
2024-08-04 21:19:47,992 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-04 21:19:47,993 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 21:19:48,264 DEBUG SenderThread:10096 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 21:19:48,653 INFO wandb-upload_0:10096 [upload_job.py:push():131] Uploaded file /tmp/tmpc_z53slvwandb/somhprnl-wandb-metadata.json
|
38 |
+
2024-08-04 21:19:48,823 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt
|
39 |
+
2024-08-04 21:19:48,824 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json
|
40 |
+
2024-08-04 21:19:52,265 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
41 |
+
2024-08-04 21:19:53,826 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
42 |
+
2024-08-04 21:19:55,827 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
43 |
+
2024-08-04 21:19:57,441 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-04 21:19:59,829 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
45 |
+
2024-08-04 21:20:02,739 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-04 21:20:02,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
|
47 |
+
2024-08-04 21:20:02,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
48 |
+
2024-08-04 21:20:02,992 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
|
49 |
+
2024-08-04 21:20:08,241 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
50 |
+
2024-08-04 21:20:13,242 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-04 21:20:17,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
|
52 |
+
2024-08-04 21:20:17,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
|
53 |
+
2024-08-04 21:20:18,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
54 |
+
2024-08-04 21:20:18,266 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-04 21:20:18,841 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
|
56 |
+
2024-08-04 21:20:23,460 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
57 |
+
2024-08-04 21:20:28,461 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-04 21:20:32,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
|
59 |
+
2024-08-04 21:20:32,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
|
60 |
+
2024-08-04 21:20:33,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
61 |
+
2024-08-04 21:20:34,171 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-04 21:20:39,171 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-04 21:20:44,172 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-04 21:20:47,927 DEBUG SystemMonitor:10096 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
65 |
+
2024-08-04 21:20:47,929 DEBUG SenderThread:10096 [sender.py:send():382] send: stats
|
66 |
+
2024-08-04 21:20:47,991 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: stop_status
|
67 |
+
2024-08-04 21:20:47,991 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: stop_status
|
68 |
+
2024-08-04 21:20:48,032 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
69 |
+
2024-08-04 21:20:49,259 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
70 |
+
2024-08-04 21:20:53,862 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
71 |
+
2024-08-04 21:20:54,334 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
72 |
+
2024-08-04 21:20:54,437 DEBUG SenderThread:10096 [sender.py:send():382] send: config
|
73 |
+
2024-08-04 21:20:54,437 DEBUG SenderThread:10096 [sender.py:send():382] send: config
|
74 |
+
2024-08-04 21:20:55,009 DEBUG SenderThread:10096 [sender.py:send():382] send: exit
|
75 |
+
2024-08-04 21:20:55,009 INFO SenderThread:10096 [sender.py:send_exit():589] handling exit code: 1
|
76 |
+
2024-08-04 21:20:55,009 INFO SenderThread:10096 [sender.py:send_exit():591] handling runtime: 67
|
77 |
+
2024-08-04 21:20:55,010 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
78 |
+
2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_exit():597] send defer
|
79 |
+
2024-08-04 21:20:55,011 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
80 |
+
2024-08-04 21:20:55,011 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 0
|
81 |
+
2024-08-04 21:20:55,011 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
82 |
+
2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 0
|
83 |
+
2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 1
|
84 |
+
2024-08-04 21:20:55,011 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
85 |
+
2024-08-04 21:20:55,011 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 1
|
86 |
+
2024-08-04 21:20:55,011 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
87 |
+
2024-08-04 21:20:55,011 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 1
|
88 |
+
2024-08-04 21:20:55,012 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 2
|
89 |
+
2024-08-04 21:20:55,012 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
90 |
+
2024-08-04 21:20:55,012 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 2
|
91 |
+
2024-08-04 21:20:55,012 INFO HandlerThread:10096 [system_monitor.py:finish():203] Stopping system monitor
|
92 |
+
2024-08-04 21:20:55,012 DEBUG SystemMonitor:10096 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
93 |
+
2024-08-04 21:20:55,012 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined cpu monitor
|
94 |
+
2024-08-04 21:20:55,012 DEBUG SystemMonitor:10096 [system_monitor.py:_start():183] Publishing last batch of metrics
|
95 |
+
2024-08-04 21:20:55,012 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined disk monitor
|
96 |
+
2024-08-04 21:20:55,046 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined gpu monitor
|
97 |
+
2024-08-04 21:20:55,047 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined memory monitor
|
98 |
+
2024-08-04 21:20:55,047 INFO HandlerThread:10096 [interfaces.py:finish():202] Joined network monitor
|
99 |
+
2024-08-04 21:20:55,047 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
100 |
+
2024-08-04 21:20:55,047 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 2
|
101 |
+
2024-08-04 21:20:55,047 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 3
|
102 |
+
2024-08-04 21:20:55,047 DEBUG SenderThread:10096 [sender.py:send():382] send: stats
|
103 |
+
2024-08-04 21:20:55,047 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
104 |
+
2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 3
|
105 |
+
2024-08-04 21:20:55,048 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
106 |
+
2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 3
|
107 |
+
2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 4
|
108 |
+
2024-08-04 21:20:55,048 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
109 |
+
2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 4
|
110 |
+
2024-08-04 21:20:55,048 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
111 |
+
2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 4
|
112 |
+
2024-08-04 21:20:55,048 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 5
|
113 |
+
2024-08-04 21:20:55,048 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
114 |
+
2024-08-04 21:20:55,048 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 5
|
115 |
+
2024-08-04 21:20:55,049 DEBUG SenderThread:10096 [sender.py:send():382] send: summary
|
116 |
+
2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
117 |
+
2024-08-04 21:20:55,050 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
118 |
+
2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 5
|
119 |
+
2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 6
|
120 |
+
2024-08-04 21:20:55,050 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
121 |
+
2024-08-04 21:20:55,050 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 6
|
122 |
+
2024-08-04 21:20:55,050 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
123 |
+
2024-08-04 21:20:55,050 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 6
|
124 |
+
2024-08-04 21:20:55,053 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: status_report
|
125 |
+
2024-08-04 21:20:55,265 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 7
|
126 |
+
2024-08-04 21:20:55,265 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
127 |
+
2024-08-04 21:20:55,265 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 7
|
128 |
+
2024-08-04 21:20:55,265 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
129 |
+
2024-08-04 21:20:55,265 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 7
|
130 |
+
2024-08-04 21:20:55,512 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 8
|
131 |
+
2024-08-04 21:20:55,513 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
132 |
+
2024-08-04 21:20:55,513 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 8
|
133 |
+
2024-08-04 21:20:55,513 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
134 |
+
2024-08-04 21:20:55,513 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 8
|
135 |
+
2024-08-04 21:20:55,513 INFO SenderThread:10096 [job_builder.py:build():296] Attempting to build job artifact
|
136 |
+
2024-08-04 21:20:55,514 INFO SenderThread:10096 [job_builder.py:_get_source_type():426] is repo sourced job
|
137 |
+
2024-08-04 21:20:55,528 INFO SenderThread:10096 [job_builder.py:build():402] adding wandb-job metadata file
|
138 |
+
2024-08-04 21:20:55,537 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 9
|
139 |
+
2024-08-04 21:20:55,537 DEBUG SenderThread:10096 [sender.py:send():382] send: artifact
|
140 |
+
2024-08-04 21:20:55,537 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
141 |
+
2024-08-04 21:20:55,539 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 9
|
142 |
+
2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
|
143 |
+
2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
144 |
+
2024-08-04 21:20:55,864 INFO Thread-12 :10096 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json
|
145 |
+
2024-08-04 21:20:56,009 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
|
146 |
+
2024-08-04 21:20:57,540 INFO SenderThread:10096 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
147 |
+
2024-08-04 21:20:57,540 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
148 |
+
2024-08-04 21:20:57,540 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 9
|
149 |
+
2024-08-04 21:20:57,540 INFO SenderThread:10096 [dir_watcher.py:finish():358] shutting down directory watcher
|
150 |
+
2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_211947-niq3ake5/files
|
151 |
+
2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt requirements.txt
|
152 |
+
2024-08-04 21:20:57,865 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml config.yaml
|
153 |
+
2024-08-04 21:20:57,867 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-metadata.json wandb-metadata.json
|
154 |
+
2024-08-04 21:20:57,867 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json wandb-summary.json
|
155 |
+
2024-08-04 21:20:57,869 INFO SenderThread:10096 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_211947-niq3ake5/files/output.log output.log
|
156 |
+
2024-08-04 21:20:57,869 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 10
|
157 |
+
2024-08-04 21:20:57,870 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
|
158 |
+
2024-08-04 21:20:57,872 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
159 |
+
2024-08-04 21:20:57,872 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 10
|
160 |
+
2024-08-04 21:20:57,872 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
161 |
+
2024-08-04 21:20:57,872 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 10
|
162 |
+
2024-08-04 21:20:57,872 INFO SenderThread:10096 [file_pusher.py:finish():172] shutting down file pusher
|
163 |
+
2024-08-04 21:20:58,009 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
|
164 |
+
2024-08-04 21:20:58,010 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
|
165 |
+
2024-08-04 21:20:58,272 INFO wandb-upload_1:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/config.yaml
|
166 |
+
2024-08-04 21:20:58,376 INFO wandb-upload_0:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/requirements.txt
|
167 |
+
2024-08-04 21:20:58,453 INFO wandb-upload_2:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/wandb-summary.json
|
168 |
+
2024-08-04 21:20:58,476 INFO wandb-upload_3:10096 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_211947-niq3ake5/files/output.log
|
169 |
+
2024-08-04 21:20:58,677 INFO Thread-11 (_thread_body):10096 [sender.py:transition_state():617] send defer: 11
|
170 |
+
2024-08-04 21:20:58,677 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
171 |
+
2024-08-04 21:20:58,677 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 11
|
172 |
+
2024-08-04 21:20:58,677 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
173 |
+
2024-08-04 21:20:58,677 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 11
|
174 |
+
2024-08-04 21:20:58,677 INFO SenderThread:10096 [file_pusher.py:join():178] waiting for file pusher
|
175 |
+
2024-08-04 21:20:58,677 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 12
|
176 |
+
2024-08-04 21:20:58,677 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
177 |
+
2024-08-04 21:20:58,677 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 12
|
178 |
+
2024-08-04 21:20:58,678 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
179 |
+
2024-08-04 21:20:58,678 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 12
|
180 |
+
2024-08-04 21:20:58,678 INFO SenderThread:10096 [file_stream.py:finish():595] file stream finish called
|
181 |
+
2024-08-04 21:20:58,860 INFO SenderThread:10096 [file_stream.py:finish():599] file stream finish is done
|
182 |
+
2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 13
|
183 |
+
2024-08-04 21:20:58,860 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
184 |
+
2024-08-04 21:20:58,860 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 13
|
185 |
+
2024-08-04 21:20:58,860 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
186 |
+
2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 13
|
187 |
+
2024-08-04 21:20:58,860 INFO SenderThread:10096 [sender.py:transition_state():617] send defer: 14
|
188 |
+
2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send():382] send: final
|
189 |
+
2024-08-04 21:20:58,861 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: defer
|
190 |
+
2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send():382] send: footer
|
191 |
+
2024-08-04 21:20:58,861 INFO HandlerThread:10096 [handler.py:handle_request_defer():172] handle defer: 14
|
192 |
+
2024-08-04 21:20:58,861 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: defer
|
193 |
+
2024-08-04 21:20:58,861 INFO SenderThread:10096 [sender.py:send_request_defer():613] handle sender defer: 14
|
194 |
+
2024-08-04 21:20:58,861 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
|
195 |
+
2024-08-04 21:20:58,862 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
|
196 |
+
2024-08-04 21:20:58,862 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: poll_exit
|
197 |
+
2024-08-04 21:20:58,862 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: poll_exit
|
198 |
+
2024-08-04 21:20:58,862 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: server_info
|
199 |
+
2024-08-04 21:20:58,863 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: server_info
|
200 |
+
2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: get_summary
|
201 |
+
2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: sampled_history
|
202 |
+
2024-08-04 21:20:58,864 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: internal_messages
|
203 |
+
2024-08-04 21:20:58,865 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: job_info
|
204 |
+
2024-08-04 21:20:59,033 DEBUG SenderThread:10096 [sender.py:send_request():409] send_request: job_info
|
205 |
+
2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
206 |
+
2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
207 |
+
2024-08-04 21:20:59,033 INFO MainThread:10096 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
208 |
+
2024-08-04 21:20:59,033 DEBUG HandlerThread:10096 [handler.py:handle_request():146] handle_request: shutdown
|
209 |
+
2024-08-04 21:20:59,033 INFO HandlerThread:10096 [handler.py:finish():869] shutting down handler
|
210 |
+
2024-08-04 21:20:59,865 INFO WriterThread:10096 [datastore.py:close():296] close: /project/wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb
|
211 |
+
2024-08-04 21:21:00,033 INFO SenderThread:10096 [sender.py:finish():1572] shutting down sender
|
212 |
+
2024-08-04 21:21:00,033 INFO SenderThread:10096 [file_pusher.py:finish():172] shutting down file pusher
|
213 |
+
2024-08-04 21:21:00,033 INFO SenderThread:10096 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_211947-niq3ake5/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
1 |
+
2024-08-04 21:19:47,163 INFO MainThread:10025 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 21:19:47,163 INFO MainThread:10025 [wandb_setup.py:_flush():76] Configure stats pid to 10025
|
3 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
|
6 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_211947-niq3ake5/logs/debug.log
|
9 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_211947-niq3ake5/logs/debug-internal.log
|
10 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-21:19:16', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 21:19:47,164 INFO MainThread:10025 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 21:19:47,169 INFO MainThread:10025 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 21:19:47,170 INFO MainThread:10025 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 21:19:47,175 INFO MainThread:10025 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 21:19:47,312 INFO MainThread:10025 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 21:19:47,827 INFO MainThread:10025 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 21:19:47,910 INFO MainThread:10025 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 21:19:47,910 INFO MainThread:10025 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 21:19:47,990 INFO MainThread:10025 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 21:19:47,991 INFO MainThread:10025 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 21:19:47,992 INFO MainThread:10025 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 21:20:54,436 INFO MainThread:10025 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
29 |
+
2024-08-04 21:20:54,436 INFO MainThread:10025 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-04 21:21:00,034 WARNING MsgRouterThr:10025 [router.py:message_loop():77] message_loop has been closed
|
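(Aside on the config block in this debug.log: it records global_batch_size 320, micro_batch_size 8, world_size 1 and gradient_accumulation_steps 40, which are mutually consistent under the usual relation global = micro × accumulation × world_size. A minimal sketch of that check in plain Python; the helper name `accumulation_steps` is ours, not part of llm-recipes or wandb.)

# Sketch: verify the gradient-accumulation value implied by the run config above.
# Assumes the standard relation global = micro * accumulation * world_size.
def accumulation_steps(global_batch_size: int, micro_batch_size: int, world_size: int) -> int:
    assert global_batch_size % (micro_batch_size * world_size) == 0, "global batch must divide evenly"
    return global_batch_size // (micro_batch_size * world_size)

# Values taken from the config logged above (run niq3ake5).
print(accumulation_steps(320, 8, 1))  # -> 40, matching 'gradient_accumulation_steps': 40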
wandb/run-20240804_211947-niq3ake5/run-niq3ake5.wandb
ADDED
Binary file (22.2 kB). View file
|
|
wandb/run-20240812_055620-qpw0uqx2/files/config.yaml
ADDED
@@ -0,0 +1,314 @@
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '235289369'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '235289369'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '235289369'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Phi/Phi-2
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-sample-Phi-2_train_2024-08-12-05:56:09
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-sample-Phi-2
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-sample-Phi-2
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Phi/Phi-2
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: anyprecision
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-sample-Phi-2
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 50304
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723409780.063771
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
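(Aside: like the earlier runs, this config.yaml stores every field as a desc/value pair under its key. If you want the flat argument dict back, the nesting can be stripped with a few lines of PyYAML; this is only a sketch, assumes PyYAML is installed, and is not code shipped by the repository.)

# Sketch: flatten a wandb config.yaml ('key: {desc, value}') back into a plain dict.
# The path below is the file added in this commit.
import yaml

with open("wandb/run-20240812_055620-qpw0uqx2/files/config.yaml") as f:
    raw = yaml.safe_load(f)

flat = {
    key: entry["value"]
    for key, entry in raw.items()
    if isinstance(entry, dict) and "value" in entry  # skips scalar entries such as wandb_version
}
print(flat["seq_length"], flat["optimizer"])  # 4096 anyprecision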
wandb/run-20240812_055620-qpw0uqx2/files/output.log
ADDED
@@ -0,0 +1,9 @@
1 |
+
Created Hugging Face repository with ID koichi12/yans-sample-Phi-2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-Phi-2/latest_iteration.txt
|
8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
9 |
+
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
|
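(Aside: the warning in this output.log is emitted by transformers when a model is built with attn_implementation="flash_attention_2" while still on CPU, which matches the run config above. A hedged illustration of the load pattern the message refers to, using the public transformers API rather than the repository's own finetuning.py.)

# Sketch: build the model on CPU in bf16, then move it to the GPU before the first forward pass.
# The model path mirrors the run config above; this is generic transformers usage, not project code.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "/share/pretrained_lm/Phi/Phi-2",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.to("cuda")  # addresses the "not initialized on GPU" warning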
wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T20:56:20.724831",
|
5 |
+
"startedAt": "2024-08-11T20:56:20.050826",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Phi/Phi-2",
|
23 |
+
"--train-data-path",
|
24 |
+
"235289369",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"235289369",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"235289369",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"anyprecision",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Phi/Phi-2",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-sample-Phi-2",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-sample-Phi-2",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-sample-Phi-2",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-sample-Phi-2_train_2024-08-12-05:56:09"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"_wandb": {"runtime": 19}}
|
wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log
ADDED
@@ -0,0 +1,181 @@
1 |
+
2024-08-12 05:56:20,065 INFO StreamThr :11662 [internal.py:wandb_internal():86] W&B internal server running at pid: 11662, started at: 2024-08-12 05:56:20.064563
|
2 |
+
2024-08-12 05:56:20,067 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-12 05:56:20,069 INFO WriterThread:11662 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb
|
4 |
+
2024-08-12 05:56:20,070 DEBUG SenderThread:11662 [sender.py:send():382] send: header
|
5 |
+
2024-08-12 05:56:20,085 DEBUG SenderThread:11662 [sender.py:send():382] send: run
|
6 |
+
2024-08-12 05:56:20,612 INFO SenderThread:11662 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_055620-qpw0uqx2/files
|
7 |
+
2024-08-12 05:56:20,612 INFO SenderThread:11662 [sender.py:_start_run_threads():1136] run started: qpw0uqx2 with start time 1723409780.063771
|
8 |
+
2024-08-12 05:56:20,617 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-12 05:56:20,617 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-12 05:56:20,704 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-12 05:56:20,711 DEBUG HandlerThread:11662 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-12 05:56:20,711 DEBUG HandlerThread:11662 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-12 05:56:20,711 INFO HandlerThread:11662 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-12 05:56:20,711 INFO SystemMonitor:11662 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-12 05:56:20,711 INFO HandlerThread:11662 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-12 05:56:20,712 INFO SystemMonitor:11662 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-12 05:56:20,712 INFO SystemMonitor:11662 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-12 05:56:20,713 INFO SystemMonitor:11662 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-12 05:56:20,714 INFO SystemMonitor:11662 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-12 05:56:20,714 INFO SystemMonitor:11662 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-12 05:56:20,724 DEBUG HandlerThread:11662 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-12 05:56:20,729 DEBUG HandlerThread:11662 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-12 05:56:20,742 DEBUG HandlerThread:11662 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:56:20.724831', 'startedAt': '2024-08-11T20:56:20.050826', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Phi/Phi-2', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Phi/Phi-2', '--save', '/work/llm_recipes/models/yans-sample-Phi-2', '--load', '/work/llm_recipes/models/yans-sample-Phi-2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-Phi-2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-Phi-2_train_2024-08-12-05:56:09'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
26 |
+
2024-08-12 05:56:20,742 INFO HandlerThread:11662 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-12 05:56:20,742 INFO HandlerThread:11662 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-12 05:56:20,743 INFO HandlerThread:11662 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-12 05:56:20,749 DEBUG SenderThread:11662 [sender.py:send():382] send: files
|
30 |
+
2024-08-12 05:56:20,749 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-12 05:56:20,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-12 05:56:20,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-12 05:56:20,759 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-12 05:56:20,760 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-12 05:56:20,761 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-12 05:56:21,039 DEBUG SenderThread:11662 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-12 05:56:21,402 INFO wandb-upload_0:11662 [upload_job.py:push():131] Uploaded file /tmp/tmp1ghcluufwandb/07mrguha-wandb-metadata.json
|
38 |
+
2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
|
39 |
+
2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt
|
40 |
+
2024-08-12 05:56:21,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json
|
41 |
+
2024-08-12 05:56:23,614 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
|
42 |
+
2024-08-12 05:56:25,807 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-12 05:56:30,807 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-12 05:56:35,758 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: stop_status
|
45 |
+
2024-08-12 05:56:35,759 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: internal_messages
|
46 |
+
2024-08-12 05:56:35,759 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: stop_status
|
47 |
+
2024-08-12 05:56:35,962 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
48 |
+
2024-08-12 05:56:40,657 DEBUG SenderThread:11662 [sender.py:send():382] send: exit
|
49 |
+
2024-08-12 05:56:40,657 INFO SenderThread:11662 [sender.py:send_exit():589] handling exit code: 255
|
50 |
+
2024-08-12 05:56:40,657 INFO SenderThread:11662 [sender.py:send_exit():591] handling runtime: 19
|
51 |
+
2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
52 |
+
2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:send_exit():597] send defer
|
53 |
+
2024-08-12 05:56:40,659 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
54 |
+
2024-08-12 05:56:40,659 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 0
|
55 |
+
2024-08-12 05:56:40,659 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
56 |
+
2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 0
|
57 |
+
2024-08-12 05:56:40,659 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 1
|
58 |
+
2024-08-12 05:56:40,659 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
59 |
+
2024-08-12 05:56:40,660 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 1
|
60 |
+
2024-08-12 05:56:40,660 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
61 |
+
2024-08-12 05:56:40,660 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 1
|
62 |
+
2024-08-12 05:56:40,660 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 2
|
63 |
+
2024-08-12 05:56:40,660 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
64 |
+
2024-08-12 05:56:40,660 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 2
|
65 |
+
2024-08-12 05:56:40,660 INFO HandlerThread:11662 [system_monitor.py:finish():203] Stopping system monitor
|
66 |
+
2024-08-12 05:56:40,660 DEBUG SystemMonitor:11662 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
67 |
+
2024-08-12 05:56:40,660 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined cpu monitor
|
68 |
+
2024-08-12 05:56:40,660 DEBUG SystemMonitor:11662 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
69 |
+
2024-08-12 05:56:40,661 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined disk monitor
|
70 |
+
2024-08-12 05:56:40,661 DEBUG SystemMonitor:11662 [system_monitor.py:_start():183] Publishing last batch of metrics
|
71 |
+
2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined gpu monitor
|
72 |
+
2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined memory monitor
|
73 |
+
2024-08-12 05:56:40,693 INFO HandlerThread:11662 [interfaces.py:finish():202] Joined network monitor
|
74 |
+
2024-08-12 05:56:40,693 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
75 |
+
2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 2
|
76 |
+
2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 3
|
77 |
+
2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send():382] send: stats
|
78 |
+
2024-08-12 05:56:40,694 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
79 |
+
2024-08-12 05:56:40,694 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 3
|
80 |
+
2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
81 |
+
2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 3
|
82 |
+
2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 4
|
83 |
+
2024-08-12 05:56:40,694 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
84 |
+
2024-08-12 05:56:40,694 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 4
|
85 |
+
2024-08-12 05:56:40,694 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
86 |
+
2024-08-12 05:56:40,694 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 4
|
87 |
+
2024-08-12 05:56:40,695 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 5
|
88 |
+
2024-08-12 05:56:40,695 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
89 |
+
2024-08-12 05:56:40,695 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 5
|
90 |
+
2024-08-12 05:56:40,695 DEBUG SenderThread:11662 [sender.py:send():382] send: summary
|
91 |
+
2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
92 |
+
2024-08-12 05:56:40,696 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
93 |
+
2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 5
|
94 |
+
2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 6
|
95 |
+
2024-08-12 05:56:40,696 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
96 |
+
2024-08-12 05:56:40,696 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 6
|
97 |
+
2024-08-12 05:56:40,696 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
98 |
+
2024-08-12 05:56:40,696 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 6
|
99 |
+
2024-08-12 05:56:40,699 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
100 |
+
2024-08-12 05:56:40,927 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 7
|
101 |
+
2024-08-12 05:56:40,927 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
102 |
+
2024-08-12 05:56:40,928 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 7
|
103 |
+
2024-08-12 05:56:40,928 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
104 |
+
2024-08-12 05:56:40,928 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 7
|
105 |
+
2024-08-12 05:56:41,625 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml
|
106 |
+
2024-08-12 05:56:41,626 INFO Thread-12 :11662 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json
|
107 |
+
2024-08-12 05:56:41,657 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: poll_exit
|
108 |
+
2024-08-12 05:56:41,724 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 8
|
109 |
+
2024-08-12 05:56:41,724 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: poll_exit
|
110 |
+
2024-08-12 05:56:41,724 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
111 |
+
2024-08-12 05:56:41,724 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 8
|
112 |
+
2024-08-12 05:56:41,724 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
113 |
+
2024-08-12 05:56:41,725 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 8
|
114 |
+
2024-08-12 05:56:41,725 INFO SenderThread:11662 [job_builder.py:build():296] Attempting to build job artifact
|
115 |
+
2024-08-12 05:56:41,725 INFO SenderThread:11662 [job_builder.py:_get_source_type():426] is repo sourced job
|
116 |
+
2024-08-12 05:56:41,740 INFO SenderThread:11662 [job_builder.py:build():402] adding wandb-job metadata file
|
117 |
+
2024-08-12 05:56:41,748 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 9
|
118 |
+
2024-08-12 05:56:41,749 DEBUG SenderThread:11662 [sender.py:send():382] send: artifact
|
119 |
+
2024-08-12 05:56:41,749 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
120 |
+
2024-08-12 05:56:41,750 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 9
|
121 |
+
2024-08-12 05:56:42,618 INFO SenderThread:11662 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'versionIndex': 7}}}
|
122 |
+
2024-08-12 05:56:42,618 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
123 |
+
2024-08-12 05:56:42,618 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 9
|
124 |
+
2024-08-12 05:56:42,618 INFO SenderThread:11662 [dir_watcher.py:finish():358] shutting down directory watcher
|
125 |
+
2024-08-12 05:56:42,626 INFO Thread-12 :11662 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
|
126 |
+
2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_055620-qpw0uqx2/files
|
127 |
+
2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt requirements.txt
|
128 |
+
2024-08-12 05:56:42,627 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml config.yaml
|
129 |
+
2024-08-12 05:56:42,629 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-metadata.json wandb-metadata.json
|
130 |
+
2024-08-12 05:56:42,630 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json wandb-summary.json
|
131 |
+
2024-08-12 05:56:42,631 INFO SenderThread:11662 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log output.log
|
132 |
+
2024-08-12 05:56:42,631 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 10
|
133 |
+
2024-08-12 05:56:42,633 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
134 |
+
2024-08-12 05:56:42,633 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 10
|
135 |
+
2024-08-12 05:56:42,633 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
136 |
+
2024-08-12 05:56:42,634 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 10
|
137 |
+
2024-08-12 05:56:42,635 INFO SenderThread:11662 [file_pusher.py:finish():172] shutting down file pusher
|
138 |
+
2024-08-12 05:56:42,657 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: poll_exit
|
139 |
+
2024-08-12 05:56:42,657 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: poll_exit
|
140 |
+
2024-08-12 05:56:43,034 INFO wandb-upload_0:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/requirements.txt
|
141 |
+
2024-08-12 05:56:43,159 INFO wandb-upload_1:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/config.yaml
|
142 |
+
2024-08-12 05:56:43,201 INFO wandb-upload_2:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/wandb-summary.json
|
143 |
+
2024-08-12 05:56:43,202 INFO wandb-upload_3:11662 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_055620-qpw0uqx2/files/output.log
|
144 |
+
2024-08-12 05:56:43,402 INFO Thread-11 (_thread_body):11662 [sender.py:transition_state():617] send defer: 11
|
145 |
+
2024-08-12 05:56:43,403 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
146 |
+
2024-08-12 05:56:43,403 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 11
|
147 |
+
2024-08-12 05:56:43,403 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
148 |
+
2024-08-12 05:56:43,403 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 11
|
149 |
+
2024-08-12 05:56:43,403 INFO SenderThread:11662 [file_pusher.py:join():178] waiting for file pusher
|
150 |
+
2024-08-12 05:56:43,403 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 12
|
151 |
+
2024-08-12 05:56:43,403 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
152 |
+
2024-08-12 05:56:43,404 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 12
|
153 |
+
2024-08-12 05:56:43,404 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
154 |
+
2024-08-12 05:56:43,404 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 12
|
155 |
+
2024-08-12 05:56:43,404 INFO SenderThread:11662 [file_stream.py:finish():595] file stream finish called
|
156 |
+
2024-08-12 05:56:43,591 INFO SenderThread:11662 [file_stream.py:finish():599] file stream finish is done
|
157 |
+
2024-08-12 05:56:43,591 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 13
|
158 |
+
2024-08-12 05:56:43,591 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
159 |
+
2024-08-12 05:56:43,591 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 13
|
160 |
+
2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
161 |
+
2024-08-12 05:56:43,592 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 13
|
162 |
+
2024-08-12 05:56:43,592 INFO SenderThread:11662 [sender.py:transition_state():617] send defer: 14
|
163 |
+
2024-08-12 05:56:43,592 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: defer
|
164 |
+
2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send():382] send: final
|
165 |
+
2024-08-12 05:56:43,592 INFO HandlerThread:11662 [handler.py:handle_request_defer():172] handle defer: 14
|
166 |
+
2024-08-12 05:56:43,592 DEBUG SenderThread:11662 [sender.py:send():382] send: footer
|
167 |
+
2024-08-12 05:56:43,593 DEBUG SenderThread:11662 [sender.py:send_request():409] send_request: defer
|
168 |
+
2024-08-12 05:56:43,593 INFO SenderThread:11662 [sender.py:send_request_defer():613] handle sender defer: 14
|
169 |
+
2024-08-12 05:56:47,593 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
170 |
+
2024-08-12 05:56:52,594 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
171 |
+
2024-08-12 05:56:57,595 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
172 |
+
2024-08-12 05:57:02,595 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
173 |
+
2024-08-12 05:57:07,596 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
174 |
+
2024-08-12 05:57:12,597 DEBUG HandlerThread:11662 [handler.py:handle_request():146] handle_request: status_report
|
175 |
+
2024-08-12 05:57:17,123 WARNING StreamThr :11662 [internal.py:is_dead():414] Internal process exiting, parent pid 11591 disappeared
|
176 |
+
2024-08-12 05:57:17,123 ERROR StreamThr :11662 [internal.py:wandb_internal():152] Internal process shutdown.
|
177 |
+
2024-08-12 05:57:17,597 INFO SenderThread:11662 [sender.py:finish():1572] shutting down sender
|
178 |
+
2024-08-12 05:57:17,597 INFO SenderThread:11662 [file_pusher.py:finish():172] shutting down file pusher
|
179 |
+
2024-08-12 05:57:17,597 INFO SenderThread:11662 [file_pusher.py:join():178] waiting for file pusher
|
180 |
+
2024-08-12 05:57:17,598 INFO WriterThread:11662 [datastore.py:close():296] close: /project/wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb
|
181 |
+
2024-08-12 05:57:17,598 INFO HandlerThread:11662 [handler.py:finish():869] shutting down handler
|
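Note on the log above: it ends with the sender handling exit code 255 and the parent pid 11591 disappearing, i.e. the run was killed rather than finishing cleanly. Below is a minimal triage sketch (not part of this repository; only the log path is taken from this diff) for pulling the warnings, errors and reported exit code out of such a debug-internal.log.

import re
from pathlib import Path

log_path = Path("wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log")

exit_code = None
for line in log_path.read_text().splitlines():
    # Surface anything the internal process flagged as a problem.
    if " WARNING " in line or " ERROR " in line:
        print(line)
    # The SenderThread logs the exit code it was asked to handle (255 for this run).
    match = re.search(r"handling exit code: (\d+)", line)
    if match:
        exit_code = int(match.group(1))

print("reported exit code:", exit_code)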
wandb/run-20240812_055620-qpw0uqx2/logs/debug.log
ADDED
@@ -0,0 +1,27 @@
1 |
+
2024-08-12 05:56:20,056 INFO MainThread:11591 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Configure stats pid to 11591
|
3 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train phi'}
|
6 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_055620-qpw0uqx2/logs/debug.log
|
9 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_055620-qpw0uqx2/logs/debug-internal.log
|
10 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Phi/Phi-2', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-Phi-2_train_2024-08-12-05:56:09', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-Phi-2', 'save': '/work/llm_recipes/models/yans-sample-Phi-2', 'base_model': '/share/pretrained_lm/Phi/Phi-2', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-Phi-2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 50304, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-12 05:56:20,057 INFO MainThread:11591 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-12 05:56:20,062 INFO MainThread:11591 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-12 05:56:20,063 INFO MainThread:11591 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-12 05:56:20,068 INFO MainThread:11591 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-12 05:56:20,080 INFO MainThread:11591 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-12 05:56:20,616 INFO MainThread:11591 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-12 05:56:20,697 INFO MainThread:11591 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-12 05:56:20,697 INFO MainThread:11591 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-12 05:56:20,758 INFO MainThread:11591 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-12 05:56:20,758 INFO MainThread:11591 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-12 05:56:20,759 INFO MainThread:11591 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-12 05:56:20,759 INFO MainThread:11591 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-12 05:56:20,760 INFO MainThread:11591 [wandb_init.py:init():847] run started, returning control to user process
|
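The debug.log above records wandb.init being called from examples/finetuning.py with the full training config. That script is not included in this diff, so the following is only a sketch of the kind of call that produces such a log; the entity, project, run name and the few config values shown are copied from the log itself, everything else is illustrative.

import wandb

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="yans-sample-Phi-2_train_2024-08-12-05:56:09",
    config={
        "base_model": "/share/pretrained_lm/Phi/Phi-2",
        "seq_length": 4096,
        "global_batch_size": 320,
        "micro_batch_size": 1,
        "optimizer": "anyprecision",
        "lr": 2e-5,
        "train_iters": 20000,
    },
)
# ... the training loop would report metrics with run.log(...) here ...
run.finish()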
wandb/run-20240812_055620-qpw0uqx2/run-qpw0uqx2.wandb
ADDED
Binary file (7.38 kB).
wandb/run-20240812_073955-ikoro1zp/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '304771887'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '304771887'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '304771887'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-qwen2-0.5B_train_2024-08-12-07:39:43
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 10
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 10
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-qwen2-0.5B
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 151680
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723415995.685329
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Qwen2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: silu
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 896
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: qwen2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 4096
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 14
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 24
|
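The config.yaml above stores each hyperparameter as a desc/value pair. The sketch below (not project code; the file path is taken from this diff) flattens it back into a plain dict and sanity-checks that the stored gradient_accumulation_steps follows from the batch-size settings, 320 / (1 × 1) = 320.

import yaml

with open("wandb/run-20240812_073955-ikoro1zp/files/config.yaml") as f:
    raw = yaml.safe_load(f)

# Keep only the real hyperparameters, dropping wandb's own bookkeeping entries.
config = {
    key: entry["value"]
    for key, entry in raw.items()
    if isinstance(entry, dict) and "value" in entry and key != "_wandb"
}

expected_accum = config["global_batch_size"] // (config["micro_batch_size"] * config["world_size"])
assert expected_accum == config["gradient_accumulation_steps"] == 320

print(config["model_architecture"], config["hidden_size"], config["num_hidden_layers"])
# -> Qwen2ForCausalLM 896 24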
wandb/run-20240812_073955-ikoro1zp/files/output.log
ADDED
The diff for this file is too large to render.
See raw diff
wandb/run-20240812_073955-ikoro1zp/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_073955-ikoro1zp/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T22:39:56.314869",
|
5 |
+
"startedAt": "2024-08-11T22:39:55.672249",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--train-data-path",
|
24 |
+
"304771887",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"304771887",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"304771887",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"10",
|
56 |
+
"--eval-interval",
|
57 |
+
"10",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-qwen2-0.5B",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-qwen2-0.5B_train_2024-08-12-07:39:43"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
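The metadata above records a single NVIDIA A100-SXM4-40GB with memory_total 42949672960 bytes (40 GiB). As an illustration only (this is not what wandb itself executes), the same GPU fields can be read back with pynvml, which is pinned in this run's requirements.txt (pynvml==11.4.1).

import pynvml

pynvml.nvmlInit()
for index in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    name = pynvml.nvmlDeviceGetName(handle)          # "NVIDIA A100-SXM4-40GB" on this host
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)  # memory.total == 42949672960 here
    print(name, memory.total)
pynvml.nvmlShutdown()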
wandb/run-20240812_073955-ikoro1zp/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"training/loss": 2.4635202884674072, "training/perplexity": 11.746088463770842, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1167, "optimizer/lr": 1.9945203423500063e-05, "optimizer/variance_l2": 0.0048320659825907535, "optimizer/variance_sqrt_l2": 0.5318417899390797, "optimizer/momentum_l2": 0.127020084622386, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.2829437255859375, "optimizer/variance_sqrt_l1": 4615.0, "optimizer/momentum_l1": 979.125, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0029296875, "optimizer/variance_sqrt_abs_max": 0.05419921875, "optimizer/momentum_abs_max": 0.01129150390625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 73.68068221400608, "stats/tokens_per_sec": 17793.53774429062, "stats/tokens_per_sec_per_gpu": 17793.53774429062, "stats/tflops": 71.54763648032535, "_timestamp": 1723503194.8273196, "_runtime": 87199.14199066162, "_step": 1167, "evaluation/val_loss": 2.4397435188293457, "evaluation/val_ppl": 11.470099449157715, "_wandb": {"runtime": 87227}}
|
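The derived metrics in this summary are internally consistent: perplexity is exp(loss), and the token throughput is seq_len × global_batch_size divided by the iteration time. A quick check (a sketch, not project code) using the values above:

import math

loss = 2.4635202884674072
seq_len = 4097
global_batch_size = 320
iteration_time = 73.68068221400608  # stats/1_iteration_time, seconds per optimizer step

perplexity = math.exp(loss)  # ~11.746, matches training/perplexity
tokens_per_sec = seq_len * global_batch_size / iteration_time  # ~17793.5, matches stats/tokens_per_sec
print(round(perplexity, 3), round(tokens_per_sec, 1))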
wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
wandb/run-20240812_073955-ikoro1zp/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
+2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Configure stats pid to 14724
+2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-12 07:39:55,678 INFO MainThread:14724 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_073955-ikoro1zp/logs/debug.log
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_073955-ikoro1zp/logs/debug-internal.log
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():566] calling init triggers
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:39:43', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():616] starting backend
+2024-08-12 07:39:55,679 INFO MainThread:14724 [wandb_init.py:init():620] setting up manager
+2024-08-12 07:39:55,684 INFO MainThread:14724 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-12 07:39:55,685 INFO MainThread:14724 [wandb_init.py:init():628] backend started and connected
+2024-08-12 07:39:55,689 INFO MainThread:14724 [wandb_init.py:init():720] updated telemetry
+2024-08-12 07:39:55,704 INFO MainThread:14724 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-12 07:39:56,202 INFO MainThread:14724 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-12 07:39:56,287 INFO MainThread:14724 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+2024-08-12 07:39:56,287 INFO MainThread:14724 [wandb_init.py:init():804] starting run threads in backend
+2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-12 07:39:56,346 INFO MainThread:14724 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-12 07:39:56,347 INFO MainThread:14724 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-12 07:39:56,348 INFO MainThread:14724 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-12 07:40:02,086 INFO MainThread:14724 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
+2024-08-12 07:40:02,086 INFO MainThread:14724 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
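The debug log above traces a standard `wandb.init()` call: settings are merged from the two settings files, the environment variables, and the training config, then the backend process is spawned and stdout/stderr are redirected. The sketch below shows an equivalent init call; the keyword values are copied from the log, while the surrounding snippet is ours and is not the repository's finetuning code.

```python
import wandb

# Illustrative only: mirrors the entity/project/name recorded in debug.log.
run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="yans-qwen2-0.5B_train_2024-08-12-07:39:43",
    notes="Train Qwen2",  # the 'run_notes' value from the environment settings
    config={"seq_length": 4096, "global_batch_size": 320, "micro_batch_size": 1},
)
run.log({"training/loss": 2.4635202884674072})  # metrics then stream to the run
run.finish()
```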
wandb/run-20240823_160642-78xnl14c/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
+wandb_version: 1
+
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+  - '28623823675'
+  - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 2048
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: HFPreTrainedTokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: Qwen2-0.5b-0.2_train_2024-08-23-16:06:29
+wandb_project:
+  desc: null
+  value: llm_tutorial-0.2
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/Qwen2-0.5b-0.2
+save:
+  desc: null
+  value: /work/llm_recipes/models/Qwen2-0.5b-0.2
+base_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 10
+save_interval:
+  desc: null
+  value: 10
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: anyprecision
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 7500
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 7500
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 5
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 131072
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/Qwen2-0.5b-0.2
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: true
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+valid_micro_batch_size:
+  desc: null
+  value: 1
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 151680
+gradient_accumulation_steps:
+  desc: null
+  value: 64
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1724396802.555005
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      - 105
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      - 105
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+model_architecture:
+  desc: null
+  value: Qwen2ForCausalLM
+activation_function:
+  desc: null
+  value: silu
+hidden_size:
+  desc: null
+  value: 896
+model_type:
+  desc: null
+  value: qwen2
+max_position_embeddings:
+  desc: null
+  value: 2048
+num_attention_heads:
+  desc: null
+  value: 14
+num_hidden_layers:
+  desc: null
+  value: 24
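The derived gradient_accumulation_steps at the end of this config follows from the batch settings above it together with world_size: 1. A minimal sketch of that relation (the helper name is ours, not the repository's):

```python
def grad_accum_steps(global_batch_size: int, micro_batch_size: int, world_size: int) -> int:
    """Number of micro-batches accumulated per optimizer step."""
    assert global_batch_size % (micro_batch_size * world_size) == 0
    return global_batch_size // (micro_batch_size * world_size)

print(grad_accum_steps(320, 5, 1))  # 64, matching gradient_accumulation_steps here
print(grad_accum_steps(320, 1, 1))  # 320, matching the earlier ikoro1zp run's config
```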
wandb/run-20240823_160642-78xnl14c/files/output.log
ADDED
@@ -0,0 +1,253 @@
+Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/model.pt
+Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/model.pt
+--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
+--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
+BFloat16 enabled for mixed precision - using bfSixteen policy
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+Let split = None
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+--> applying fsdp activation checkpointing...
+> datasets target sizes (minimum size):
+    train: 2400000
+    validation: 2403200
+    test: 3200
+> building train, validation, and test datasets for GPT ...
+> finished creating GPT datasets ...
+Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/optimizer.pt
+Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000010/optimizer.pt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151936, 896)
+      (layers): ModuleList(
+        (0-23): 24 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): Qwen2DecoderLayer(
+              (self_attn): Qwen2FlashAttention2(
+                (q_proj): Linear(in_features=896, out_features=896, bias=True)
+                (k_proj): Linear(in_features=896, out_features=128, bias=True)
+                (v_proj): Linear(in_features=896, out_features=128, bias=True)
+                (o_proj): Linear(in_features=896, out_features=896, bias=False)
+                (rotary_emb): Qwen2RotaryEmbedding()
+              )
+              (mlp): Qwen2MLP(
+                (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
+                (up_proj): Linear(in_features=896, out_features=4864, bias=False)
+                (down_proj): Linear(in_features=4864, out_features=896, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): Qwen2RMSNorm()
+              (post_attention_layernorm): Qwen2RMSNorm()
+            )
+          )
+        )
+      )
+      (norm): Qwen2RMSNorm()
+    )
+    (lm_head): Linear(in_features=896, out_features=151936, bias=False)
+  )
+)
+model config: Qwen2Config {
+  "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 2048,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[rank0]:[2024-08-23 16:06:47,708] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
+------------------------------------------------------------------
+iteration: 11 , TFLOPS: 78.6590174462143, Tokens per sec: 22522.217665366814, Loss: 4.254438400268555
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 12 , TFLOPS: 82.22378668182658, Tokens per sec: 23542.90812474309, Loss: 4.243721008300781
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 13 , TFLOPS: 81.9584539835788, Tokens per sec: 23466.93615131027, Loss: 4.228161334991455
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 14 , TFLOPS: 81.93344116019101, Tokens per sec: 23459.774299165976, Loss: 4.26573371887207
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 15 , TFLOPS: 82.18786108647078, Tokens per sec: 23532.62164895645, Loss: 4.256962776184082
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 16 , TFLOPS: 81.96618258354832, Tokens per sec: 23469.14905984146, Loss: 4.25914192199707
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 17 , TFLOPS: 81.87438758063158, Tokens per sec: 23442.865651995726, Loss: 4.231760025024414
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 18 , TFLOPS: 81.88852755364577, Tokens per sec: 23446.914311136756, Loss: 4.2337870597839355
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 19 , TFLOPS: 82.2474099184413, Tokens per sec: 23549.672100371965, Loss: 4.217740535736084
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 20 , TFLOPS: 81.96982275621532, Tokens per sec: 23470.191340355606, Loss: 4.259789943695068
+------------------------------------------------------------------
+Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020
+Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/model.pt
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+  warnings.warn(
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+  warnings.warn(
+Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/model.pt
+Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/optimizer.pt
+[rank0]:[2024-08-23 16:11:31,233] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006403441000202292, 'preprocessing_with_comm': 0.0006401519999599259, 'state_converting': 0.9829786160003096, <Type.ALL: 'all'>: 0.9914544049997858})
+Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/optimizer.pt
+Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/scheduler.pt
+Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/scheduler.pt
+Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/rng.pt
+Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020/rng.pt
+Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000020, took 4.19s
+eval ppl=57.051639556884766, eval loss=4.043956756591797
+------------------------------------------------------------------
+iteration: 21 , TFLOPS: 81.67173393242047, Tokens per sec: 23384.840396610194, Loss: 4.235799789428711
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 22 , TFLOPS: 82.17554686977023, Tokens per sec: 23529.095753542137, Loss: 4.2608537673950195
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 23 , TFLOPS: 81.93471100946451, Tokens per sec: 23460.13789157621, Loss: 4.211125373840332
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 24 , TFLOPS: 81.95296715435393, Tokens per sec: 23465.36512276062, Loss: 4.202465534210205
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 25 , TFLOPS: 81.86218501721527, Tokens per sec: 23439.37172595571, Loss: 4.217883586883545
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 26 , TFLOPS: 82.2054193168547, Tokens per sec: 23537.64904822184, Loss: 4.235620021820068
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 27 , TFLOPS: 81.90650355273718, Tokens per sec: 23452.06132895408, Loss: 4.211632251739502
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 28 , TFLOPS: 81.86577887433305, Tokens per sec: 23440.400745067473, Loss: 4.186619758605957
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 29 , TFLOPS: 81.98724461570576, Tokens per sec: 23475.179692922393, Loss: 4.187148571014404
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 30 , TFLOPS: 82.14778410156913, Tokens per sec: 23521.146517348767, Loss: 4.202610492706299
+------------------------------------------------------------------
+Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030
+Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/model.pt
+Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/model.pt
+Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/optimizer.pt
+[rank0]:[2024-08-23 16:16:16,915] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0063674119996903755, 'preprocessing_with_comm': 0.0006565419998878497, 'state_converting': 0.9940128050002386, <Type.ALL: 'all'>: 1.0024299110000356})
+Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/optimizer.pt
+Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/scheduler.pt
+Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/scheduler.pt
+Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/rng.pt
+Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030/rng.pt
+Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000030, took 4.25s
+eval ppl=49.190242767333984, eval loss=3.895695209503174
+------------------------------------------------------------------
+iteration: 31 , TFLOPS: 81.49741363388435, Tokens per sec: 23334.92775042431, Loss: 4.200056552886963
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 32 , TFLOPS: 81.78042346796904, Tokens per sec: 23415.961168990125, Loss: 4.181160926818848
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 33 , TFLOPS: 81.92238077096499, Tokens per sec: 23456.60740868476, Loss: 4.155094623565674
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 34 , TFLOPS: 82.11937602485122, Tokens per sec: 23513.012511762787, Loss: 4.177372932434082
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 35 , TFLOPS: 81.71716264856666, Tokens per sec: 23397.847874538613, Loss: 4.142157077789307
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 36 , TFLOPS: 81.89085188246439, Tokens per sec: 23447.57982974198, Loss: 4.166767597198486
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 37 , TFLOPS: 81.95670790833275, Tokens per sec: 23466.436202441997, Loss: 4.169678688049316
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 38 , TFLOPS: 82.18953848455327, Tokens per sec: 23533.101933683516, Loss: 4.145669937133789
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 39 , TFLOPS: 81.94531984778251, Tokens per sec: 23463.175490741974, Loss: 4.136501312255859
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 40 , TFLOPS: 81.88752676960887, Tokens per sec: 23446.627759427276, Loss: 4.12642240524292
+------------------------------------------------------------------
+Saving checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040
+Saving model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
+Saved model state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
+[rank0]:[2024-08-23 16:21:02,356] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006349127999783377, 'preprocessing_with_comm': 0.0006216020001375, 'state_converting': 0.7473425819998738, <Type.ALL: 'all'>: 0.7556547180001871})
+Saving optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
+Saved optimizer state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
+Saving scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/scheduler.pt
+Saved scheduler state dict to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/scheduler.pt
+Saving RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/rng.pt
+Saved RNG states to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/rng.pt
+Saved checkpoint to /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040, took 3.86s
+eval ppl=30.580110549926758, eval loss=3.4203498363494873
+------------------------------------------------------------------
+iteration: 41 , TFLOPS: 81.74253389997872, Tokens per sec: 23405.112354382087, Loss: 4.1325507164001465
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 42 , TFLOPS: 81.9581488586999, Tokens per sec: 23466.848785752438, Loss: 4.099006652832031
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 43 , TFLOPS: 81.91532895406834, Tokens per sec: 23454.588281568267, Loss: 4.136029243469238
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 44 , TFLOPS: 81.95839032196434, Tokens per sec: 23466.917923257286, Loss: 4.140143871307373
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 45 , TFLOPS: 82.23276643961596, Tokens per sec: 23545.47927381259, Loss: 4.161101341247559
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 46 , TFLOPS: 81.96124008786207, Tokens per sec: 23467.733888799798, Loss: 4.099796772003174
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 47 , TFLOPS: 81.96352747485656, Tokens per sec: 23468.388829955275, Loss: 4.10368537902832
+------------------------------------------------------------------
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 282, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
+    loss.backward()
+  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
+    return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+KeyboardInterrupt
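The "494.032768 Million params" line near the top of this log can be reproduced from the Qwen2Config and module dump printed with it (hidden size 896, 24 layers, intermediate size 4864, 14 query heads, 2 key/value heads, vocab 151936, tied word embeddings, biases on q/k/v only). This is a back-of-the-envelope sketch under those assumptions, not the repository's own parameter-counting code:

```python
hidden, layers, inter, vocab = 896, 24, 4864, 151936
head_dim = hidden // 14        # 64
kv_dim = 2 * head_dim          # 128: two key/value heads

embed = vocab * hidden
attn = (hidden * hidden + hidden) + 2 * (hidden * kv_dim + kv_dim) + hidden * hidden
mlp = 3 * hidden * inter       # gate_proj, up_proj, down_proj, no biases
norms = 2 * hidden             # input_layernorm + post_attention_layernorm
per_layer = attn + mlp + norms

total = embed + layers * per_layer + hidden   # + final RMSNorm; lm_head is tied
print(total / 1e6)   # 494.032768, matching the log line above
```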
wandb/run-20240823_160642-78xnl14c/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240823_160642-78xnl14c/files/wandb-metadata.json
ADDED
@@ -0,0 +1,220 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-23T07:06:43.074166",
|
5 |
+
"startedAt": "2024-08-23T07:06:42.542542",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"2048",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"5",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"1",
|
17 |
+
"--global-batch-size",
|
18 |
+
"320",
|
19 |
+
"--train-iters",
|
20 |
+
"7500",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1754785366",
|
32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1754785366",
|
35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"2e-5",
|
38 |
+
"--min-lr",
|
39 |
+
"1e-6",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"7500",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-6",
|
58 |
+
"--save-interval",
|
59 |
+
"10",
|
60 |
+
"--eval-interval",
|
61 |
+
"10",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
72 |
+
"--fsdp-activation-checkpointing",
|
73 |
+
"--sharding-strategy",
|
74 |
+
"FULL_SHARD",
|
75 |
+
"--checkpoint-type",
|
76 |
+
"LOCAL_STATE_DICT",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--upload-all-checkpoints-to-hf",
|
80 |
+
"--hf-upload-retry-limit",
|
81 |
+
"2",
|
82 |
+
"--hf-repo-id",
|
83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
84 |
+
"--wandb-entity",
|
85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
86 |
+
"--wandb-project",
|
87 |
+
"llm_tutorial-0.2",
|
88 |
+
"--wandb-name",
|
89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-16:06:29"
|
90 |
+
],
|
91 |
+
"state": "running",
|
92 |
+
"program": "/project/examples/finetuning.py",
|
93 |
+
"codePathLocal": "examples/finetuning.py",
|
94 |
+
"codePath": "examples/finetuning.py",
|
95 |
+
"git": {
|
96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
98 |
+
},
|
99 |
+
"email": null,
|
100 |
+
"root": "/project",
|
101 |
+
"host": "gpu-koiwa-00",
|
102 |
+
"username": "koiwa",
|
103 |
+
"executable": "/usr/bin/python",
|
104 |
+
"cpu_count": 18,
|
105 |
+
"cpu_count_logical": 18,
|
106 |
+
"cpu_freq": {
|
107 |
+
"current": 2400.0389999999993,
|
108 |
+
"min": 0.0,
|
109 |
+
"max": 0.0
|
110 |
+
},
|
111 |
+
"cpu_freq_per_core": [
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2400.039,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"disk": {
|
204 |
+
"/": {
|
205 |
+
"total": 0.0625,
|
206 |
+
"used": 1.1444091796875e-05
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
210 |
+
"gpu_count": 1,
|
211 |
+
"gpu_devices": [
|
212 |
+
{
|
213 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
214 |
+
"memory_total": 42949672960
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"memory": {
|
218 |
+
"total": 56.487831115722656
|
219 |
+
}
|
220 |
+
}
|
wandb/run-20240823_160642-78xnl14c/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"training/loss": 4.10368537902832, "training/perplexity": 60.56307470314165, "utils/batch_size": 5, "utils/global_batch_size": 320, "utils/seq_len": 2049, "utils/gradient_accumulation_steps": 64, "utils/iteration": 47, "optimizer/lr": 2.786e-06, "optimizer/variance_l2": 0.05447102242525233, "optimizer/variance_sqrt_l2": 0.9553866833854993, "optimizer/momentum_l2": 0.9463549769133376, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.9140548706054688, "optimizer/variance_sqrt_l1": 4069.0, "optimizer/momentum_l1": 3366.75, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.044921875, "optimizer/variance_sqrt_abs_max": 0.2119140625, "optimizer/momentum_abs_max": 0.2353515625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 27.938858723999147, "stats/tokens_per_sec": 23468.388829955275, "stats/tokens_per_sec_per_gpu": 23468.388829955275, "stats/tflops": 81.96352747485656, "_timestamp": 1724397861.433749, "_runtime": 1058.8787438869476, "_step": 47, "evaluation/val_loss": 3.4203498363494873, "evaluation/val_ppl": 30.580110549926758, "_wandb": {"runtime": 1064}}
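Two consistency checks on this summary: the throughput fields follow from utils/seq_len, the global batch size, and the measured iteration time, and the logged optimizer/lr at iteration 47 matches a linear warmup from min_lr to lr over the configured 500 warmup iterations. The sketch below only re-derives the logged numbers; it is not the scheduler or timing code used by the training scripts.

```python
seq_len, global_batch_size = 2049, 320       # utils/seq_len, utils/global_batch_size
iteration_time = 27.938858723999147          # stats/1_iteration_time (seconds)
print(seq_len * global_batch_size / iteration_time)   # ~23468.39 == stats/tokens_per_sec

lr, min_lr, warmup_iters, iteration = 2e-5, 1e-6, 500, 47
print(min_lr + (lr - min_lr) * iteration / warmup_iters)  # ~2.786e-06 == optimizer/lr
```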
wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
wandb/run-20240823_160642-78xnl14c/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
+2024-08-23 16:06:42,548 INFO MainThread:10858 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Configure stats pid to 10858
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_160642-78xnl14c/logs/debug.log
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_160642-78xnl14c/logs/debug-internal.log
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():566] calling init triggers
+2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 2048, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:06:29', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 5, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 64}
2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():616] starting backend
2024-08-23 16:06:42,549 INFO MainThread:10858 [wandb_init.py:init():620] setting up manager
2024-08-23 16:06:42,554 INFO MainThread:10858 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-08-23 16:06:42,554 INFO MainThread:10858 [wandb_init.py:init():628] backend started and connected
2024-08-23 16:06:42,559 INFO MainThread:10858 [wandb_init.py:init():720] updated telemetry
2024-08-23 16:06:42,570 INFO MainThread:10858 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
2024-08-23 16:06:42,985 INFO MainThread:10858 [wandb_run.py:_on_init():2262] communicating current version
2024-08-23 16:06:43,007 INFO MainThread:10858 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"

2024-08-23 16:06:43,007 INFO MainThread:10858 [wandb_init.py:init():804] starting run threads in backend
2024-08-23 16:06:43,104 INFO MainThread:10858 [wandb_run.py:_console_start():2241] atexit reg
2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2096] redirect: wrap_raw
2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2161] Wrapping output streams.
2024-08-23 16:06:43,105 INFO MainThread:10858 [wandb_run.py:_redirect():2186] Redirects installed.
2024-08-23 16:06:43,106 INFO MainThread:10858 [wandb_init.py:init():847] run started, returning control to user process
2024-08-23 16:06:47,996 INFO MainThread:10858 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 2048, 'num_attention_heads': 14, 'num_hidden_layers': 24}
2024-08-23 16:06:47,997 INFO MainThread:10858 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
2024-08-23 16:24:34,131 WARNING MsgRouterThr:10858 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_160642-78xnl14c/run-78xnl14c.wandb
ADDED
Binary file (137 kB).
wandb/run-20240823_162922-z3gs82jm/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1754785366'
|
38 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1754785366'
|
43 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 2048
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 2
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: Qwen2-0.5b-0.2_train_2024-08-23-16:29:10
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: llm_tutorial-0.2
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 10
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 10
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 2.0e-05
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 7500
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 1.0e-06
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 7500
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 640
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 5
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-06
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/Qwen2-0.5b-0.2
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 1
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 1
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 128
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724398162.884223
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
322 |
+
model_architecture:
|
323 |
+
desc: null
|
324 |
+
value: Qwen2ForCausalLM
|
325 |
+
activation_function:
|
326 |
+
desc: null
|
327 |
+
value: silu
|
328 |
+
hidden_size:
|
329 |
+
desc: null
|
330 |
+
value: 896
|
331 |
+
model_type:
|
332 |
+
desc: null
|
333 |
+
value: qwen2
|
334 |
+
max_position_embeddings:
|
335 |
+
desc: null
|
336 |
+
value: 2048
|
337 |
+
num_attention_heads:
|
338 |
+
desc: null
|
339 |
+
value: 14
|
340 |
+
num_hidden_layers:
|
341 |
+
desc: null
|
342 |
+
value: 24
|
wandb/run-20240823_162922-z3gs82jm/files/output.log
ADDED
@@ -0,0 +1,174 @@
1 |
+
Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
5 |
+
Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
6 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
7 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
8 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
9 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
10 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
11 |
+
warnings.warn(
|
12 |
+
Let split = None
|
13 |
+
--> applying fsdp activation checkpointing...
|
14 |
+
> datasets target sizes (minimum size):
|
15 |
+
train: 4800000
|
16 |
+
validation: 4806400
|
17 |
+
test: 6400
|
18 |
+
> building train, validation, and test datasets for GPT ...
|
19 |
+
Unable to save the indexes because path_to_cache is None
|
20 |
+
Building a BlendedDataset for a single MegatronDataset
|
21 |
+
Unable to save the indexes because path_to_cache is None
|
22 |
+
> finished creating GPT datasets ...
|
23 |
+
Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
24 |
+
Building a BlendedDataset for a single MegatronDataset
|
25 |
+
Unable to save the indexes because path_to_cache is None
|
26 |
+
Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
27 |
+
model info: FullyShardedDataParallel(
|
28 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
29 |
+
(model): Qwen2Model(
|
30 |
+
(embed_tokens): Embedding(151936, 896)
|
31 |
+
(layers): ModuleList(
|
32 |
+
(0-23): 24 x FullyShardedDataParallel(
|
33 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
34 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
35 |
+
(self_attn): Qwen2FlashAttention2(
|
36 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
37 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
38 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
39 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
40 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
41 |
+
)
|
42 |
+
(mlp): Qwen2MLP(
|
43 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
44 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
45 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
46 |
+
(act_fn): SiLU()
|
47 |
+
)
|
48 |
+
(input_layernorm): Qwen2RMSNorm()
|
49 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
50 |
+
)
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
(norm): Qwen2RMSNorm()
|
55 |
+
)
|
56 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
57 |
+
)
|
58 |
+
)
|
59 |
+
model config: Qwen2Config {
|
60 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
61 |
+
"architectures": [
|
62 |
+
"Qwen2ForCausalLM"
|
63 |
+
],
|
64 |
+
"attention_dropout": 0.0,
|
65 |
+
"bos_token_id": 151643,
|
66 |
+
"eos_token_id": 151643,
|
67 |
+
"hidden_act": "silu",
|
68 |
+
"hidden_size": 896,
|
69 |
+
"initializer_range": 0.02,
|
70 |
+
"intermediate_size": 4864,
|
71 |
+
"label_smoothing": 0.0,
|
72 |
+
"max_position_embeddings": 2048,
|
73 |
+
"max_window_layers": 24,
|
74 |
+
"model_type": "qwen2",
|
75 |
+
"num_attention_heads": 14,
|
76 |
+
"num_hidden_layers": 24,
|
77 |
+
"num_key_value_heads": 2,
|
78 |
+
"rms_norm_eps": 1e-06,
|
79 |
+
"rope_theta": 1000000.0,
|
80 |
+
"sliding_window": 131072,
|
81 |
+
"tie_word_embeddings": true,
|
82 |
+
"torch_dtype": "bfloat16",
|
83 |
+
"transformers_version": "4.43.3",
|
84 |
+
"use_cache": false,
|
85 |
+
"use_sliding_window": false,
|
86 |
+
"vocab_size": 151936
|
87 |
+
}
|
88 |
+
[rank0]:[2024-08-23 16:29:30,218] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
89 |
+
------------------------------------------------------------------
|
90 |
+
iteration: 41 , TFLOPS: 80.56174819694237, Tokens per sec: 23067.0212685365, Loss: 4.141458988189697
|
91 |
+
------------------------------------------------------------------
|
92 |
+
------------------------------------------------------------------
|
93 |
+
iteration: 42 , TFLOPS: 82.11115614789513, Tokens per sec: 23510.658937258577, Loss: 4.13422155380249
|
94 |
+
------------------------------------------------------------------
|
95 |
+
------------------------------------------------------------------
|
96 |
+
iteration: 43 , TFLOPS: 82.06292558214139, Tokens per sec: 23496.84921352573, Loss: 4.125084400177002
|
97 |
+
------------------------------------------------------------------
|
98 |
+
------------------------------------------------------------------
|
99 |
+
iteration: 44 , TFLOPS: 82.21135543718982, Tokens per sec: 23539.348721045328, Loss: 4.142415523529053
|
100 |
+
------------------------------------------------------------------
|
101 |
+
------------------------------------------------------------------
|
102 |
+
iteration: 45 , TFLOPS: 82.19624054931023, Tokens per sec: 23535.020917242735, Loss: 4.127298831939697
|
103 |
+
------------------------------------------------------------------
|
104 |
+
------------------------------------------------------------------
|
105 |
+
iteration: 46 , TFLOPS: 82.08863919564567, Tokens per sec: 23504.21172095474, Loss: 4.131596565246582
|
106 |
+
------------------------------------------------------------------
|
107 |
+
------------------------------------------------------------------
|
108 |
+
iteration: 47 , TFLOPS: 82.19786737309032, Tokens per sec: 23535.486721170473, Loss: 4.140783786773682
|
109 |
+
------------------------------------------------------------------
|
110 |
+
------------------------------------------------------------------
|
111 |
+
iteration: 48 , TFLOPS: 82.07990700682468, Tokens per sec: 23501.711457619313, Loss: 4.111098289489746
|
112 |
+
------------------------------------------------------------------
|
113 |
+
Traceback (most recent call last):
|
114 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
115 |
+
main()
|
116 |
+
File "/project/src/llama_recipes/finetuning.py", line 282, in main
|
117 |
+
train(
|
118 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
|
119 |
+
loss: torch.Tensor = model(**batch).loss
|
120 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
121 |
+
return self._call_impl(*args, **kwargs)
|
122 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
123 |
+
return forward_call(*args, **kwargs)
|
124 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
125 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
126 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
127 |
+
return self._call_impl(*args, **kwargs)
|
128 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
129 |
+
return forward_call(*args, **kwargs)
|
130 |
+
File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 1054, in forward
|
131 |
+
outputs = self.model(
|
132 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
133 |
+
return self._call_impl(*args, **kwargs)
|
134 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
135 |
+
return forward_call(*args, **kwargs)
|
136 |
+
File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 856, in forward
|
137 |
+
layer_outputs = decoder_layer(
|
138 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
139 |
+
return self._call_impl(*args, **kwargs)
|
140 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
141 |
+
return forward_call(*args, **kwargs)
|
142 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
143 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
144 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
145 |
+
return self._call_impl(*args, **kwargs)
|
146 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
147 |
+
return forward_call(*args, **kwargs)
|
148 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 168, in forward
|
149 |
+
return self.checkpoint_fn( # type: ignore[misc]
|
150 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_compile.py", line 24, in inner
|
151 |
+
return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
|
152 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 417, in _fn
|
153 |
+
return fn(*args, **kwargs)
|
154 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/external_utils.py", line 25, in inner
|
155 |
+
return fn(*args, **kwargs)
|
156 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 488, in checkpoint
|
157 |
+
ret = function(*args, **kwargs)
|
158 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
159 |
+
return self._call_impl(*args, **kwargs)
|
160 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
161 |
+
return forward_call(*args, **kwargs)
|
162 |
+
File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 609, in forward
|
163 |
+
hidden_states = self.post_attention_layernorm(hidden_states)
|
164 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
165 |
+
return self._call_impl(*args, **kwargs)
|
166 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
167 |
+
return forward_call(*args, **kwargs)
|
168 |
+
File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 78, in forward
|
169 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
170 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 1091, in pack_hook
|
171 |
+
with torch.no_grad():
|
172 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 149, in __new__
|
173 |
+
def __new__(cls, orig_func=None):
|
174 |
+
KeyboardInterrupt
|
wandb/run-20240823_162922-z3gs82jm/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json
ADDED
@@ -0,0 +1,220 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-23T07:29:23.385958",
|
5 |
+
"startedAt": "2024-08-23T07:29:22.871856",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"2048",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"5",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"1",
|
17 |
+
"--global-batch-size",
|
18 |
+
"640",
|
19 |
+
"--train-iters",
|
20 |
+
"7500",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1754785366",
|
32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1754785366",
|
35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"2e-5",
|
38 |
+
"--min-lr",
|
39 |
+
"1e-6",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"7500",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-6",
|
58 |
+
"--save-interval",
|
59 |
+
"10",
|
60 |
+
"--eval-interval",
|
61 |
+
"10",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
72 |
+
"--fsdp-activation-checkpointing",
|
73 |
+
"--sharding-strategy",
|
74 |
+
"FULL_SHARD",
|
75 |
+
"--checkpoint-type",
|
76 |
+
"LOCAL_STATE_DICT",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--upload-all-checkpoints-to-hf",
|
80 |
+
"--hf-upload-retry-limit",
|
81 |
+
"2",
|
82 |
+
"--hf-repo-id",
|
83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
84 |
+
"--wandb-entity",
|
85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
86 |
+
"--wandb-project",
|
87 |
+
"llm_tutorial-0.2",
|
88 |
+
"--wandb-name",
|
89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-16:29:10"
|
90 |
+
],
|
91 |
+
"state": "running",
|
92 |
+
"program": "/project/examples/finetuning.py",
|
93 |
+
"codePathLocal": "examples/finetuning.py",
|
94 |
+
"codePath": "examples/finetuning.py",
|
95 |
+
"git": {
|
96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
98 |
+
},
|
99 |
+
"email": null,
|
100 |
+
"root": "/project",
|
101 |
+
"host": "gpu-koiwa-00",
|
102 |
+
"username": "koiwa",
|
103 |
+
"executable": "/usr/bin/python",
|
104 |
+
"cpu_count": 18,
|
105 |
+
"cpu_count_logical": 18,
|
106 |
+
"cpu_freq": {
|
107 |
+
"current": 2400.0389999999993,
|
108 |
+
"min": 0.0,
|
109 |
+
"max": 0.0
|
110 |
+
},
|
111 |
+
"cpu_freq_per_core": [
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2400.039,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"disk": {
|
204 |
+
"/": {
|
205 |
+
"total": 0.0625,
|
206 |
+
"used": 1.1444091796875e-05
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
210 |
+
"gpu_count": 1,
|
211 |
+
"gpu_devices": [
|
212 |
+
{
|
213 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
214 |
+
"memory_total": 42949672960
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"memory": {
|
218 |
+
"total": 56.487831115722656
|
219 |
+
}
|
220 |
+
}
|
wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"training/loss": 4.111098289489746, "training/perplexity": 61.013691480602496, "utils/batch_size": 5, "utils/global_batch_size": 640, "utils/seq_len": 2049, "utils/gradient_accumulation_steps": 128, "utils/iteration": 48, "optimizer/lr": 2.8240000000000004e-06, "optimizer/variance_l2": 0.05465312149531553, "optimizer/variance_sqrt_l2": 0.9576321330918345, "optimizer/momentum_l2": 0.9493419425990095, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.91644287109375, "optimizer/variance_sqrt_l1": 3987.5, "optimizer/momentum_l1": 3375.75, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.044921875, "optimizer/variance_sqrt_abs_max": 0.2119140625, "optimizer/momentum_abs_max": 0.234375, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 55.79848949999996, "stats/tokens_per_sec": 23501.711457619313, "stats/tokens_per_sec_per_gpu": 23501.711457619313, "stats/tflops": 82.07990700682468, "_timestamp": 1724398618.465386, "_runtime": 455.5811629295349, "_step": 48, "_wandb": {"runtime": 505}}
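This second run's summary reports utils/global_batch_size 640 alongside utils/batch_size 5 and utils/gradient_accumulation_steps 128. A minimal sketch of the assumed relation between these fields follows (world_size 1 is taken from the run config; this is an illustrative check, not code from the repository):

```python
# Values from the summary/config of run z3gs82jm.
micro_batch_size = 5
gradient_accumulation_steps = 128
world_size = 1

# Assumed relation between the logged batch-size fields:
# global batch = micro batch per GPU * accumulation steps * number of data-parallel ranks.
global_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
assert global_batch_size == 640  # matches utils/global_batch_size above
```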
wandb/run-20240823_162922-z3gs82jm/logs/debug-internal.log
ADDED
@@ -0,0 +1,453 @@
1 |
+
2024-08-23 16:29:22,886 INFO StreamThr :11966 [internal.py:wandb_internal():86] W&B internal server running at pid: 11966, started at: 2024-08-23 16:29:22.885536
|
2 |
+
2024-08-23 16:29:22,888 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-23 16:29:22,889 INFO WriterThread:11966 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_162922-z3gs82jm/run-z3gs82jm.wandb
|
4 |
+
2024-08-23 16:29:22,890 DEBUG SenderThread:11966 [sender.py:send():382] send: header
|
5 |
+
2024-08-23 16:29:22,904 DEBUG SenderThread:11966 [sender.py:send():382] send: run
|
6 |
+
2024-08-23 16:29:23,294 INFO SenderThread:11966 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_162922-z3gs82jm/files
|
7 |
+
2024-08-23 16:29:23,294 INFO SenderThread:11966 [sender.py:_start_run_threads():1136] run started: z3gs82jm with start time 1724398162.884223
|
8 |
+
2024-08-23 16:29:23,300 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-23 16:29:23,300 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-23 16:29:23,367 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-23 16:29:23,374 DEBUG HandlerThread:11966 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-23 16:29:23,374 DEBUG HandlerThread:11966 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-23 16:29:23,374 INFO HandlerThread:11966 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-23 16:29:23,374 INFO SystemMonitor:11966 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-23 16:29:23,374 INFO HandlerThread:11966 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-23 16:29:23,374 INFO SystemMonitor:11966 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-23 16:29:23,375 INFO SystemMonitor:11966 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-23 16:29:23,375 INFO SystemMonitor:11966 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-23 16:29:23,377 INFO SystemMonitor:11966 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-23 16:29:23,378 INFO SystemMonitor:11966 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-23 16:29:23,385 DEBUG HandlerThread:11966 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-23 16:29:23,387 DEBUG HandlerThread:11966 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-23 16:29:23,400 DEBUG HandlerThread:11966 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:29:23.385958', 'startedAt': '2024-08-23T07:29:22.871856', 'docker': None, 'cuda': None, 'args': ('--seq-length', '2048', '--sliding-window-size', '131072', '--micro-batch-size', '5', '--valid_micro_batch_size', '1', '--global-batch-size', '640', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:29:10'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
|
26 |
+
2024-08-23 16:29:23,400 INFO HandlerThread:11966 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-23 16:29:23,400 INFO HandlerThread:11966 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-23 16:29:23,402 INFO HandlerThread:11966 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-23 16:29:23,407 DEBUG SenderThread:11966 [sender.py:send():382] send: files
|
30 |
+
2024-08-23 16:29:23,407 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-23 16:29:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-23 16:29:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-23 16:29:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-23 16:29:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: python_packages
2024-08-23 16:29:23,421 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:29:23,617 DEBUG SenderThread:11966 [sender.py:send():382] send: telemetry
2024-08-23 16:29:24,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:29:24,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt
2024-08-23 16:29:24,297 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json
2024-08-23 16:29:24,474 INFO wandb-upload_0:11966 [upload_job.py:push():131] Uploaded file /tmp/tmpcv014twmwandb/xf5mvw68-wandb-metadata.json
2024-08-23 16:29:26,296 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:29:28,298 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:29:28,658 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:30,299 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:29:30,499 DEBUG SenderThread:11966 [sender.py:send():382] send: config
2024-08-23 16:29:30,500 DEBUG SenderThread:11966 [sender.py:send():382] send: config
2024-08-23 16:29:32,300 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:29:34,500 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:29:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:29:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:29:39,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:44,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:49,686 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:53,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:29:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:29:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:29:55,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:29:56,312 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml
2024-08-23 16:30:00,842 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:05,842 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:08,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:30:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:30:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:30:11,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:16,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:21,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:23,378 DEBUG SystemMonitor:11966 [system_monitor.py:_start():172] Starting system metrics aggregation loop
2024-08-23 16:30:23,380 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:30:23,417 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:30:23,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:30:23,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:30:27,617 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:28,122 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:30:30,329 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:30:33,164 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:38,165 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:30:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:30:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:30:43,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:48,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:53,380 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:30:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:30:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:30:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:30:53,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:30:58,640 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:03,640 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:31:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:31:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:31:08,666 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:13,666 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:18,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:23,382 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:31:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:31:23,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:31:23,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:31:23,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:23,901 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:31:23,904 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:31:23,904 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:31:23,906 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:31:24,355 INFO Thread-12 :11966 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:31:26,356 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:31:28,944 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:33,945 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:31:38,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:31:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:31:39,638 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:44,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:49,639 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:53,383 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:31:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:31:53,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:31:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:31:54,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:31:59,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:04,680 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:32:08,418 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:32:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:32:10,613 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:15,613 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:19,715 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:32:19,717 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:32:19,717 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:32:19,719 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:32:20,383 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:32:20,760 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:22,384 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:32:23,385 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:32:23,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:32:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:32:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:32:26,593 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:31,594 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:36,594 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:38,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:32:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:32:38,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:32:41,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:46,675 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:51,676 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:32:53,384 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:32:53,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:32:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:32:53,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:32:57,603 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:02,604 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:07,604 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:08,418 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:33:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:33:08,459 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:33:12,676 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:15,428 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:33:15,430 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:33:15,430 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:33:15,432 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:33:16,412 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:33:18,413 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:33:18,472 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:23,385 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:33:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:33:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:33:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:33:23,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:28,682 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:33,683 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:33:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:33:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:33:39,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:44,667 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:49,668 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:33:53,386 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:33:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:33:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:33:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:33:55,611 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:00,612 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:05,612 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:08,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:34:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:34:08,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:34:10,679 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:11,150 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:34:11,152 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:34:11,152 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:34:11,154 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:34:11,439 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:34:12,439 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:34:16,192 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:21,193 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:23,387 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:34:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:34:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:34:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:34:26,589 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:31,590 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:36,590 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:34:38,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:34:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:34:41,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:46,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:51,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:34:53,389 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:34:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:34:53,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:34:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:34:57,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:02,672 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:06,945 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:35:06,948 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:35:06,948 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:35:06,949 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:35:07,467 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:35:07,988 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:08,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:35:08,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:35:08,421 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:35:08,468 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:35:13,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:18,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:23,389 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:35:23,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:35:23,419 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:35:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:35:24,596 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:29,597 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:34,597 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:38,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:35:38,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:35:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:35:39,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:44,690 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:49,690 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:35:53,390 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:35:53,419 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:35:53,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:35:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:35:55,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:00,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:02,665 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:36:02,667 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:36:02,667 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:36:02,668 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:36:03,495 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:36:04,496 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:36:05,708 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:08,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:36:08,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:36:08,421 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:36:11,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:16,641 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:21,642 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:23,391 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:36:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:36:23,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:36:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:36:26,684 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:31,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:36,685 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:36:38,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:36:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:36:41,692 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:46,692 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:51,693 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:53,392 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:36:53,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:36:53,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:36:53,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:36:57,615 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:36:58,466 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: partial_history
2024-08-23 16:36:58,468 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:36:58,468 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:36:58,469 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:36:58,523 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:37:00,524 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:37:03,508 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:08,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:37:08,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:37:08,422 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:37:08,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:13,688 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:18,689 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:23,393 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:37:23,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:37:23,420 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:37:23,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:37:24,586 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:29,587 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:34,587 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:38,420 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: stop_status
2024-08-23 16:37:38,421 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: stop_status
2024-08-23 16:37:38,463 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:37:40,196 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:45,197 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:48,676 DEBUG SenderThread:11966 [sender.py:send():382] send: exit
2024-08-23 16:37:48,676 INFO SenderThread:11966 [sender.py:send_exit():589] handling exit code: 255
2024-08-23 16:37:48,676 INFO SenderThread:11966 [sender.py:send_exit():591] handling runtime: 505
2024-08-23 16:37:48,677 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:37:48,677 INFO SenderThread:11966 [sender.py:send_exit():597] send defer
2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 0
2024-08-23 16:37:48,678 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 0
2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 1
2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 1
2024-08-23 16:37:48,678 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 1
2024-08-23 16:37:48,678 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 2
2024-08-23 16:37:48,678 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,678 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 2
2024-08-23 16:37:48,678 INFO HandlerThread:11966 [system_monitor.py:finish():203] Stopping system monitor
2024-08-23 16:37:48,679 DEBUG SystemMonitor:11966 [system_monitor.py:_start():179] Finished system metrics aggregation loop
2024-08-23 16:37:48,679 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined cpu monitor
2024-08-23 16:37:48,679 DEBUG SystemMonitor:11966 [system_monitor.py:_start():183] Publishing last batch of metrics
2024-08-23 16:37:48,679 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined disk monitor
2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined gpu monitor
2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined memory monitor
2024-08-23 16:37:48,712 INFO HandlerThread:11966 [interfaces.py:finish():202] Joined network monitor
2024-08-23 16:37:48,713 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,713 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 2
2024-08-23 16:37:48,713 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 3
2024-08-23 16:37:48,713 DEBUG SenderThread:11966 [sender.py:send():382] send: stats
2024-08-23 16:37:48,713 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,713 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 3
2024-08-23 16:37:48,715 DEBUG SenderThread:11966 [sender.py:send():382] send: history
2024-08-23 16:37:48,715 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: summary_record
2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:37:48,716 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 3
2024-08-23 16:37:48,716 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 4
2024-08-23 16:37:48,716 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,716 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 4
2024-08-23 16:37:48,717 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,717 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 4
2024-08-23 16:37:48,717 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 5
2024-08-23 16:37:48,717 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,717 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 5
2024-08-23 16:37:48,718 DEBUG SenderThread:11966 [sender.py:send():382] send: summary
2024-08-23 16:37:48,718 INFO SenderThread:11966 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
2024-08-23 16:37:48,719 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 5
2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 6
2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,719 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 6
2024-08-23 16:37:48,719 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 6
2024-08-23 16:37:48,719 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 7
2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:48,719 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:48,719 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 7
2024-08-23 16:37:48,720 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:48,720 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 7
2024-08-23 16:37:49,550 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:37:49,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:50,524 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 8
2024-08-23 16:37:50,524 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:50,524 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:50,524 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 8
2024-08-23 16:37:50,524 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:50,524 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 8
2024-08-23 16:37:50,525 INFO SenderThread:11966 [job_builder.py:build():296] Attempting to build job artifact
2024-08-23 16:37:50,525 INFO SenderThread:11966 [job_builder.py:_get_source_type():426] is repo sourced job
2024-08-23 16:37:50,540 INFO SenderThread:11966 [job_builder.py:build():402] adding wandb-job metadata file
2024-08-23 16:37:50,549 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 9
2024-08-23 16:37:50,550 DEBUG SenderThread:11966 [sender.py:send():382] send: artifact
2024-08-23 16:37:50,550 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:50,551 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 9
2024-08-23 16:37:50,551 INFO Thread-12 :11966 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:37:50,671 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:51,630 INFO wandb-upload_0:11966 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmp7k_y5w7r
2024-08-23 16:37:52,084 INFO wandb-upload_1:11966 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp4s29y7vc
2024-08-23 16:37:53,444 INFO SenderThread:11966 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MjAxODA1Mw==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'versionIndex': 2}}}
2024-08-23 16:37:53,444 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:53,444 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 9
2024-08-23 16:37:53,444 INFO SenderThread:11966 [dir_watcher.py:finish():358] shutting down directory watcher
2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_162922-z3gs82jm/files
2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt requirements.txt
2024-08-23 16:37:53,553 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml config.yaml
2024-08-23 16:37:53,555 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-metadata.json wandb-metadata.json
2024-08-23 16:37:53,555 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json wandb-summary.json
2024-08-23 16:37:53,556 INFO SenderThread:11966 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162922-z3gs82jm/files/output.log output.log
2024-08-23 16:37:53,558 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 10
2024-08-23 16:37:53,558 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:53,559 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:53,559 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 10
2024-08-23 16:37:53,560 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:53,560 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 10
2024-08-23 16:37:53,560 INFO SenderThread:11966 [file_pusher.py:finish():172] shutting down file pusher
2024-08-23 16:37:53,672 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:53,672 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:53,988 INFO wandb-upload_0:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/requirements.txt
2024-08-23 16:37:54,025 INFO wandb-upload_1:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/config.yaml
2024-08-23 16:37:54,673 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:54,673 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:55,582 INFO wandb-upload_2:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/wandb-summary.json
2024-08-23 16:37:55,638 INFO wandb-upload_3:11966 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162922-z3gs82jm/files/output.log
2024-08-23 16:37:55,673 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:55,674 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:55,674 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: status_report
2024-08-23 16:37:55,838 INFO Thread-11 (_thread_body):11966 [sender.py:transition_state():617] send defer: 11
2024-08-23 16:37:55,839 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:55,839 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 11
2024-08-23 16:37:55,839 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:55,839 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 11
2024-08-23 16:37:55,839 INFO SenderThread:11966 [file_pusher.py:join():178] waiting for file pusher
2024-08-23 16:37:55,840 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 12
2024-08-23 16:37:55,840 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:55,840 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 12
2024-08-23 16:37:55,840 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:55,840 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 12
2024-08-23 16:37:55,840 INFO SenderThread:11966 [file_stream.py:finish():595] file stream finish called
2024-08-23 16:37:56,021 INFO SenderThread:11966 [file_stream.py:finish():599] file stream finish is done
2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 13
2024-08-23 16:37:56,021 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:56,021 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 13
2024-08-23 16:37:56,021 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 13
2024-08-23 16:37:56,021 INFO SenderThread:11966 [sender.py:transition_state():617] send defer: 14
2024-08-23 16:37:56,021 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: defer
2024-08-23 16:37:56,021 DEBUG SenderThread:11966 [sender.py:send():382] send: final
2024-08-23 16:37:56,021 INFO HandlerThread:11966 [handler.py:handle_request_defer():172] handle defer: 14
2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send():382] send: footer
2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: defer
2024-08-23 16:37:56,022 INFO SenderThread:11966 [sender.py:send_request_defer():613] handle sender defer: 14
2024-08-23 16:37:56,022 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:56,022 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: poll_exit
2024-08-23 16:37:56,023 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: poll_exit
2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: server_info
2024-08-23 16:37:56,023 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: get_summary
2024-08-23 16:37:56,024 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: server_info
2024-08-23 16:37:56,025 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: sampled_history
2024-08-23 16:37:56,026 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: internal_messages
2024-08-23 16:37:56,027 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: job_info
2024-08-23 16:37:56,184 DEBUG SenderThread:11966 [sender.py:send_request():409] send_request: job_info
2024-08-23 16:37:56,185 INFO MainThread:11966 [wandb_run.py:_footer_history_summary_info():3866] rendering history
2024-08-23 16:37:56,185 INFO MainThread:11966 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
2024-08-23 16:37:56,186 INFO MainThread:11966 [wandb_run.py:_footer_sync_info():3825] logging synced files
2024-08-23 16:37:56,186 DEBUG HandlerThread:11966 [handler.py:handle_request():146] handle_request: shutdown
2024-08-23 16:37:56,186 INFO HandlerThread:11966 [handler.py:finish():869] shutting down handler
2024-08-23 16:37:57,027 INFO WriterThread:11966 [datastore.py:close():296] close: /project/wandb/run-20240823_162922-z3gs82jm/run-z3gs82jm.wandb
2024-08-23 16:37:57,185 INFO SenderThread:11966 [sender.py:finish():1572] shutting down sender
2024-08-23 16:37:57,185 INFO SenderThread:11966 [file_pusher.py:finish():172] shutting down file pusher
2024-08-23 16:37:57,185 INFO SenderThread:11966 [file_pusher.py:join():178] waiting for file pusher