Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- wandb/run-20240802_173428-s75vpwte/files/config.yaml +335 -0
- wandb/run-20240802_173428-s75vpwte/files/output.log +0 -0
- wandb/run-20240802_173428-s75vpwte/files/requirements.txt +271 -0
- wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json +215 -0
- wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json +1 -0
- wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log +0 -0
- wandb/run-20240802_173428-s75vpwte/logs/debug.log +29 -0
- wandb/run-20240804_135607-ikp7tdz1/files/config.yaml +335 -0
- wandb/run-20240804_135607-ikp7tdz1/files/output.log +130 -0
- wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt +271 -0
- wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json +215 -0
- wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json +1 -0
- wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log +216 -0
- wandb/run-20240804_135607-ikp7tdz1/logs/debug.log +30 -0
- wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb +0 -0
- wandb/run-20240812_070449-ufge4h1y/files/config.yaml +335 -0
- wandb/run-20240812_070449-ufge4h1y/files/output.log +158 -0
- wandb/run-20240812_070449-ufge4h1y/files/requirements.txt +271 -0
- wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json +215 -0
- wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json +1 -0
- wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log +616 -0
- wandb/run-20240812_070449-ufge4h1y/logs/debug.log +29 -0
- wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb +0 -0
- wandb/run-20240812_073202-yby212na/files/config.yaml +335 -0
- wandb/run-20240812_073202-yby212na/files/output.log +116 -0
- wandb/run-20240812_073202-yby212na/files/requirements.txt +271 -0
- wandb/run-20240812_073202-yby212na/files/wandb-metadata.json +215 -0
- wandb/run-20240812_073202-yby212na/files/wandb-summary.json +1 -0
- wandb/run-20240812_073202-yby212na/logs/debug-internal.log +236 -0
- wandb/run-20240812_073202-yby212na/logs/debug.log +29 -0
- wandb/run-20240812_073202-yby212na/run-yby212na.wandb +0 -0
- wandb/run-20240815_041534-1ld4rgmy/files/config.yaml +337 -0
- wandb/run-20240815_041534-1ld4rgmy/files/output.log +92 -0
- wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt +354 -0
- wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json +215 -0
- wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json +1 -0
- wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log +162 -0
- wandb/run-20240815_041534-1ld4rgmy/logs/debug.log +29 -0
- wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb +0 -0
- wandb/run-20240824_202022-z2bjbf6e/files/config.yaml +321 -0
- wandb/run-20240824_202022-z2bjbf6e/files/output.log +51 -0
- wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt +375 -0
- wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json +880 -0
- wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json +1 -0
- wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log +191 -0
- wandb/run-20240824_202022-z2bjbf6e/logs/debug.log +28 -0
- wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb +0 -0
- wandb/run-20240826_221726-7jzdp89j/files/config.yaml +342 -0
- wandb/run-20240826_221726-7jzdp89j/files/output.log +0 -0
- wandb/run-20240826_221726-7jzdp89j/files/requirements.txt +375 -0
wandb/run-20240802_173428-s75vpwte/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample_train_2024-08-02-17:34:15
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722587668.341658
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 256
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: mistral
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 512
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 4
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 4
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: MistralForCausalLM
|
wandb/run-20240802_173428-s75vpwte/files/output.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240802_173428-s75vpwte/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-02T08:34:28.941229",
|
5 |
+
"startedAt": "2024-08-02T08:34:28.326109",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample_train_2024-08-02-17:34:15"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0409999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.041,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.041,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.041,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.041,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.041,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.041,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.041,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.041,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.041,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.041,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.041,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.041,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.041,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.041,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.041,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.041,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.041,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.041,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 8.780712127685547, "training/perplexity": 6507.50970149773, "utils/batch_size": 8, "utils/global_batch_size": 320, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 1410, "optimizer/lr": 1.989808738231659e-05, "optimizer/variance_l2": 0.013855160145659429, "optimizer/variance_sqrt_l2": 0.9992841304001847, "optimizer/momentum_l2": 0.9839698623853019, "optimizer/weight_l2": 101.83051175850979, "optimizer/variance_l1": 1.002197265625, "optimizer/variance_sqrt_l1": 536.5, "optimizer/momentum_l1": 403.875, "optimizer/weight_l1": 332288.0, "optimizer/variance_abs_max": 0.0011444091796875, "optimizer/variance_sqrt_abs_max": 0.033935546875, "optimizer/momentum_abs_max": 0.03369140625, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 1.277997902000152, "stats/tokens_per_sec": 128450.91509389698, "stats/tokens_per_sec_per_gpu": 128450.91509389698, "stats/tflops": 9.093190310165799, "_timestamp": 1722589282.0763872, "_runtime": 1613.73472905159, "_step": 1410, "evaluation/val_loss": 8.783937454223633, "evaluation/val_ppl": 6528.5322265625, "_wandb": {"runtime": 1614}}
|
wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240802_173428-s75vpwte/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-02 17:34:28,332 INFO MainThread:13969 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Configure stats pid to 13969
|
3 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug.log
|
9 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log
|
10 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-02-17:34:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-02 17:34:28,339 INFO MainThread:13969 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-02 17:34:28,341 INFO MainThread:13969 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-02 17:34:28,346 INFO MainThread:13969 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-02 17:34:28,360 INFO MainThread:13969 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-02 17:34:28,832 INFO MainThread:13969 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-02 17:34:28,977 INFO MainThread:13969 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
|
29 |
+
2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 256
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-llama-sample_train_2024-08-04-13:55:35
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-llama-sample
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-llama-sample
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 2000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 2000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 2048
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-llama-sample
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722747367.911791
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 2048
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: llama
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 2048
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 32
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 22
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: LlamaForCausalLM
|
wandb/run-20240804_135607-ikp7tdz1/files/output.log
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-llama-sample.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
8 |
+
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
|
11 |
+
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
12 |
+
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
|
13 |
+
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
|
14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
15 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
16 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
17 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
18 |
+
--> applying fsdp activation checkpointing...
|
19 |
+
> datasets target sizes (minimum size):
|
20 |
+
train: 640000
|
21 |
+
validation: 35200
|
22 |
+
test: 3200
|
23 |
+
> building train, validation, and test datasets for GPT ...
|
24 |
+
> finished creating GPT datasets ...
|
25 |
+
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
26 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
27 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
|
28 |
+
File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
29 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
|
30 |
+
model info: FullyShardedDataParallel(
|
31 |
+
(_fsdp_wrapped_module): LlamaForCausalLM(
|
32 |
+
(model): LlamaModel(
|
33 |
+
(embed_tokens): Embedding(32000, 2048)
|
34 |
+
(layers): ModuleList(
|
35 |
+
(0-21): 22 x FullyShardedDataParallel(
|
36 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
37 |
+
(_checkpoint_wrapped_module): LlamaDecoderLayer(
|
38 |
+
(self_attn): LlamaFlashAttention2(
|
39 |
+
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
40 |
+
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
|
41 |
+
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
|
42 |
+
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
43 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
44 |
+
)
|
45 |
+
(mlp): LlamaMLP(
|
46 |
+
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
47 |
+
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
48 |
+
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
49 |
+
(act_fn): SiLU()
|
50 |
+
)
|
51 |
+
(input_layernorm): LlamaRMSNorm()
|
52 |
+
(post_attention_layernorm): LlamaRMSNorm()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
)
|
57 |
+
(norm): LlamaRMSNorm()
|
58 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
59 |
+
)
|
60 |
+
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
|
61 |
+
)
|
62 |
+
)
|
63 |
+
model config: LlamaConfig {
|
64 |
+
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
65 |
+
"architectures": [
|
66 |
+
"LlamaForCausalLM"
|
67 |
+
],
|
68 |
+
"attention_bias": false,
|
69 |
+
"attention_dropout": 0.0,
|
70 |
+
"bos_token_id": 1,
|
71 |
+
"eos_token_id": 2,
|
72 |
+
"hidden_act": "silu",
|
73 |
+
"hidden_size": 2048,
|
74 |
+
"initializer_range": 0.02,
|
75 |
+
"intermediate_size": 5632,
|
76 |
+
"label_smoothing": 0.0,
|
77 |
+
"max_position_embeddings": 2048,
|
78 |
+
"mlp_bias": false,
|
79 |
+
"model_type": "llama",
|
80 |
+
"num_attention_heads": 32,
|
81 |
+
"num_hidden_layers": 22,
|
82 |
+
"num_key_value_heads": 4,
|
83 |
+
"pretraining_tp": 1,
|
84 |
+
"rms_norm_eps": 1e-05,
|
85 |
+
"rope_scaling": null,
|
86 |
+
"rope_theta": 10000.0,
|
87 |
+
"tie_word_embeddings": false,
|
88 |
+
"torch_dtype": "float32",
|
89 |
+
"transformers_version": "4.43.3",
|
90 |
+
"use_cache": false,
|
91 |
+
"vocab_size": 32000
|
92 |
+
}
|
93 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
94 |
+
warnings.warn(
|
95 |
+
Let split = None
|
96 |
+
Building a BlendedDataset for a single MegatronDataset
|
97 |
+
Unable to save the indexes because path_to_cache is None
|
98 |
+
Building a BlendedDataset for a single MegatronDataset
|
99 |
+
Unable to save the indexes because path_to_cache is None
|
100 |
+
Building a BlendedDataset for a single MegatronDataset
|
101 |
+
Unable to save the indexes because path_to_cache is None
|
102 |
+
Traceback (most recent call last):
|
103 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
104 |
+
main()
|
105 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
106 |
+
train(
|
107 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
|
108 |
+
loss: torch.Tensor = model(**batch).loss
|
109 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
110 |
+
return self._call_impl(*args, **kwargs)
|
111 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
112 |
+
return forward_call(*args, **kwargs)
|
113 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
114 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
116 |
+
return self._call_impl(*args, **kwargs)
|
117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
118 |
+
return forward_call(*args, **kwargs)
|
119 |
+
File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
|
120 |
+
outputs = self.model(
|
121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
122 |
+
return self._call_impl(*args, **kwargs)
|
123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
124 |
+
return forward_call(*args, **kwargs)
|
125 |
+
File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
|
126 |
+
cache_position = torch.arange(
|
127 |
+
RuntimeError: CUDA error: device-side assert triggered
|
128 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
129 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
|
130 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-04T04:56:08.637907",
|
5 |
+
"startedAt": "2024-08-04T04:56:07.879507",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"256",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"2048",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"2000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"2000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-llama-sample",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-llama-sample",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-llama-sample",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-llama-sample_train_2024-08-04-13:55:35"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0389999999993,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.039,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48781967163086
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 67}}
|
wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 13:56:07,912 INFO StreamThr :9151 [internal.py:wandb_internal():86] W&B internal server running at pid: 9151, started at: 2024-08-04 13:56:07.911369
|
2 |
+
2024-08-04 13:56:07,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 13:56:07,916 INFO WriterThread:9151 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
|
4 |
+
2024-08-04 13:56:07,917 DEBUG SenderThread:9151 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 13:56:08,068 DEBUG SenderThread:9151 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 13:56:08,527 INFO SenderThread:9151 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_135607-ikp7tdz1/files
|
7 |
+
2024-08-04 13:56:08,527 INFO SenderThread:9151 [sender.py:_start_run_threads():1136] run started: ikp7tdz1 with start time 1722747367.911791
|
8 |
+
2024-08-04 13:56:08,532 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 13:56:08,533 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 13:56:08,619 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 13:56:08,625 INFO HandlerThread:9151 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 13:56:08,625 INFO SystemMonitor:9151 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 13:56:08,626 INFO HandlerThread:9151 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 13:56:08,626 INFO SystemMonitor:9151 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 13:56:08,627 INFO SystemMonitor:9151 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 13:56:08,629 INFO SystemMonitor:9151 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 13:56:08,637 DEBUG HandlerThread:9151 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 13:56:08,639 DEBUG HandlerThread:9151 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T04:56:08.637907', 'startedAt': '2024-08-04T04:56:07.879507', 'docker': None, 'cuda': None, 'args': ('--seq-length', '256', '--sliding-window-size', '2048', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-13:55:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
|
26 |
+
2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 13:56:08,653 INFO HandlerThread:9151 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 13:56:08,681 DEBUG SenderThread:9151 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 13:56:08,681 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-04 13:56:08,691 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-04 13:56:08,691 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-04 13:56:08,692 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 13:56:08,938 DEBUG SenderThread:9151 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 13:56:09,405 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /tmp/tmpins_li9awandb/mkgvo0s4-wandb-metadata.json
|
38 |
+
2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
|
39 |
+
2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json
|
40 |
+
2024-08-04 13:56:10,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
41 |
+
2024-08-04 13:56:12,531 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
42 |
+
2024-08-04 13:56:13,586 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-04 13:56:16,533 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
44 |
+
2024-08-04 13:56:19,567 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-04 13:56:23,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
|
46 |
+
2024-08-04 13:56:23,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
|
47 |
+
2024-08-04 13:56:23,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
48 |
+
2024-08-04 13:56:24,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
49 |
+
2024-08-04 13:56:29,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
50 |
+
2024-08-04 13:56:34,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-04 13:56:38,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
|
52 |
+
2024-08-04 13:56:38,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
|
53 |
+
2024-08-04 13:56:38,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
54 |
+
2024-08-04 13:56:39,955 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-04 13:56:40,547 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
|
56 |
+
2024-08-04 13:56:45,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
57 |
+
2024-08-04 13:56:50,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-04 13:56:53,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
|
59 |
+
2024-08-04 13:56:53,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
|
60 |
+
2024-08-04 13:56:53,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
61 |
+
2024-08-04 13:56:55,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-04 13:57:00,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-04 13:57:05,958 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-04 13:57:08,629 DEBUG SystemMonitor:9151 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
65 |
+
2024-08-04 13:57:08,630 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
|
66 |
+
2024-08-04 13:57:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
|
67 |
+
2024-08-04 13:57:08,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
|
68 |
+
2024-08-04 13:57:08,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
69 |
+
2024-08-04 13:57:11,872 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
70 |
+
2024-08-04 13:57:15,196 DEBUG SenderThread:9151 [sender.py:send():382] send: config
|
71 |
+
2024-08-04 13:57:15,197 DEBUG SenderThread:9151 [sender.py:send():382] send: config
|
72 |
+
2024-08-04 13:57:16,571 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
73 |
+
2024-08-04 13:57:16,600 DEBUG SenderThread:9151 [sender.py:send():382] send: exit
|
74 |
+
2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():589] handling exit code: 1
|
75 |
+
2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():591] handling runtime: 67
|
76 |
+
2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
77 |
+
2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:send_exit():597] send defer
|
78 |
+
2024-08-04 13:57:16,602 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
79 |
+
2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 0
|
80 |
+
2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
81 |
+
2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 0
|
82 |
+
2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 1
|
83 |
+
2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
84 |
+
2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 1
|
85 |
+
2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
86 |
+
2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 1
|
87 |
+
2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 2
|
88 |
+
2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
89 |
+
2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 2
|
90 |
+
2024-08-04 13:57:16,603 INFO HandlerThread:9151 [system_monitor.py:finish():203] Stopping system monitor
|
91 |
+
2024-08-04 13:57:16,603 DEBUG SystemMonitor:9151 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
92 |
+
2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined cpu monitor
|
93 |
+
2024-08-04 13:57:16,604 DEBUG SystemMonitor:9151 [system_monitor.py:_start():183] Publishing last batch of metrics
|
94 |
+
2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined disk monitor
|
95 |
+
2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined gpu monitor
|
96 |
+
2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined memory monitor
|
97 |
+
2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined network monitor
|
98 |
+
2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
99 |
+
2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 2
|
100 |
+
2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 3
|
101 |
+
2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
|
102 |
+
2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
103 |
+
2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 3
|
104 |
+
2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
105 |
+
2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 3
|
106 |
+
2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 4
|
107 |
+
2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
108 |
+
2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 4
|
109 |
+
2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
110 |
+
2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 4
|
111 |
+
2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 5
|
112 |
+
2024-08-04 13:57:16,639 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
113 |
+
2024-08-04 13:57:16,639 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 5
|
114 |
+
2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send():382] send: summary
|
115 |
+
2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
116 |
+
2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
117 |
+
2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 5
|
118 |
+
2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 6
|
119 |
+
2024-08-04 13:57:16,640 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
120 |
+
2024-08-04 13:57:16,640 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 6
|
121 |
+
2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
122 |
+
2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 6
|
123 |
+
2024-08-04 13:57:16,643 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
|
124 |
+
2024-08-04 13:57:16,835 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 7
|
125 |
+
2024-08-04 13:57:16,836 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
126 |
+
2024-08-04 13:57:16,836 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 7
|
127 |
+
2024-08-04 13:57:16,836 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
128 |
+
2024-08-04 13:57:16,836 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 7
|
129 |
+
2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
|
130 |
+
2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
|
131 |
+
2024-08-04 13:57:17,600 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
132 |
+
2024-08-04 13:57:18,334 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 8
|
133 |
+
2024-08-04 13:57:18,334 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
134 |
+
2024-08-04 13:57:18,334 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
135 |
+
2024-08-04 13:57:18,335 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 8
|
136 |
+
2024-08-04 13:57:18,335 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
137 |
+
2024-08-04 13:57:18,335 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 8
|
138 |
+
2024-08-04 13:57:18,335 INFO SenderThread:9151 [job_builder.py:build():296] Attempting to build job artifact
|
139 |
+
2024-08-04 13:57:18,336 INFO SenderThread:9151 [job_builder.py:_get_source_type():426] is repo sourced job
|
140 |
+
2024-08-04 13:57:18,350 INFO SenderThread:9151 [job_builder.py:build():402] adding wandb-job metadata file
|
141 |
+
2024-08-04 13:57:18,359 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 9
|
142 |
+
2024-08-04 13:57:18,360 DEBUG SenderThread:9151 [sender.py:send():382] send: artifact
|
143 |
+
2024-08-04 13:57:18,360 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
144 |
+
2024-08-04 13:57:18,361 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 9
|
145 |
+
2024-08-04 13:57:18,573 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
146 |
+
2024-08-04 13:57:18,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
147 |
+
2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
148 |
+
2024-08-04 13:57:19,234 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
149 |
+
2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 9
|
150 |
+
2024-08-04 13:57:19,234 INFO SenderThread:9151 [dir_watcher.py:finish():358] shutting down directory watcher
|
151 |
+
2024-08-04 13:57:19,573 INFO SenderThread:9151 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_135607-ikp7tdz1/files
|
152 |
+
2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt requirements.txt
|
153 |
+
2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml config.yaml
|
154 |
+
2024-08-04 13:57:19,575 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json wandb-metadata.json
|
155 |
+
2024-08-04 13:57:19,576 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json wandb-summary.json
|
156 |
+
2024-08-04 13:57:19,577 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log output.log
|
157 |
+
2024-08-04 13:57:19,579 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 10
|
158 |
+
2024-08-04 13:57:19,579 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
159 |
+
2024-08-04 13:57:19,579 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
160 |
+
2024-08-04 13:57:19,580 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 10
|
161 |
+
2024-08-04 13:57:19,581 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
162 |
+
2024-08-04 13:57:19,581 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 10
|
163 |
+
2024-08-04 13:57:19,581 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
|
164 |
+
2024-08-04 13:57:19,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
165 |
+
2024-08-04 13:57:19,601 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
166 |
+
2024-08-04 13:57:19,983 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
|
167 |
+
2024-08-04 13:57:20,084 INFO wandb-upload_1:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
|
168 |
+
2024-08-04 13:57:20,165 INFO wandb-upload_2:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
|
169 |
+
2024-08-04 13:57:20,334 INFO wandb-upload_3:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
|
170 |
+
2024-08-04 13:57:20,534 INFO Thread-11 (_thread_body):9151 [sender.py:transition_state():617] send defer: 11
|
171 |
+
2024-08-04 13:57:20,534 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
172 |
+
2024-08-04 13:57:20,534 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 11
|
173 |
+
2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
174 |
+
2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 11
|
175 |
+
2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
|
176 |
+
2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 12
|
177 |
+
2024-08-04 13:57:20,535 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
178 |
+
2024-08-04 13:57:20,535 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 12
|
179 |
+
2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
180 |
+
2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 12
|
181 |
+
2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_stream.py:finish():595] file stream finish called
|
182 |
+
2024-08-04 13:57:20,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
183 |
+
2024-08-04 13:57:20,717 INFO SenderThread:9151 [file_stream.py:finish():599] file stream finish is done
|
184 |
+
2024-08-04 13:57:20,717 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 13
|
185 |
+
2024-08-04 13:57:20,717 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
186 |
+
2024-08-04 13:57:20,717 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
187 |
+
2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 13
|
188 |
+
2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
189 |
+
2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 13
|
190 |
+
2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 14
|
191 |
+
2024-08-04 13:57:20,718 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
|
192 |
+
2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: final
|
193 |
+
2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 14
|
194 |
+
2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: footer
|
195 |
+
2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
|
196 |
+
2024-08-04 13:57:20,719 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 14
|
197 |
+
2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
198 |
+
2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
|
199 |
+
2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
200 |
+
2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: server_info
|
201 |
+
2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
|
202 |
+
2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: get_summary
|
203 |
+
2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: server_info
|
204 |
+
2024-08-04 13:57:20,721 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: sampled_history
|
205 |
+
2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
|
206 |
+
2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: job_info
|
207 |
+
2024-08-04 13:57:20,885 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: job_info
|
208 |
+
2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
209 |
+
2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
210 |
+
2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
211 |
+
2024-08-04 13:57:20,886 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: shutdown
|
212 |
+
2024-08-04 13:57:20,886 INFO HandlerThread:9151 [handler.py:finish():869] shutting down handler
|
213 |
+
2024-08-04 13:57:21,722 INFO WriterThread:9151 [datastore.py:close():296] close: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
|
214 |
+
2024-08-04 13:57:21,885 INFO SenderThread:9151 [sender.py:finish():1572] shutting down sender
|
215 |
+
2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
|
216 |
+
2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_135607-ikp7tdz1/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Configure stats pid to 9079
|
3 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug.log
|
9 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log
|
10 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 256, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-13:55:35', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 2048, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 13:56:07,909 INFO MainThread:9079 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 13:56:07,911 INFO MainThread:9079 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 13:56:07,916 INFO MainThread:9079 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 13:56:08,064 INFO MainThread:9079 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 13:56:08,532 INFO MainThread:9079 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 13:56:08,691 INFO MainThread:9079 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 13:57:15,195 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
29 |
+
2024-08-04 13:57:15,196 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-04 13:57:21,887 WARNING MsgRouterThr:9079 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
ADDED
Binary file (22.5 kB). View file
|
|
wandb/run-20240812_070449-ufge4h1y/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '304771887'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '304771887'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '304771887'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-qwen2-0.5B_train_2024-08-12-07:04:37
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 5
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-qwen2-0.5B
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 151680
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723413889.11596
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Qwen2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: silu
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 896
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: qwen2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 4096
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 14
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 24
|
wandb/run-20240812_070449-ufge4h1y/files/output.log
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
9 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
10 |
+
warnings.warn(
|
11 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
12 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
13 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
|
14 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
15 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
16 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
17 |
+
Let split = None
|
18 |
+
Building a BlendedDataset for a single MegatronDataset
|
19 |
+
Unable to save the indexes because path_to_cache is None
|
20 |
+
Building a BlendedDataset for a single MegatronDataset
|
21 |
+
Unable to save the indexes because path_to_cache is None
|
22 |
+
Building a BlendedDataset for a single MegatronDataset
|
23 |
+
Unable to save the indexes because path_to_cache is None
|
24 |
+
--> applying fsdp activation checkpointing...
|
25 |
+
> datasets target sizes (minimum size):
|
26 |
+
train: 6400000
|
27 |
+
validation: 323200
|
28 |
+
test: 3200
|
29 |
+
> building train, validation, and test datasets for GPT ...
|
30 |
+
> finished creating GPT datasets ...
|
31 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
32 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
33 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
|
34 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
35 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
36 |
+
model info: FullyShardedDataParallel(
|
37 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
38 |
+
(model): Qwen2Model(
|
39 |
+
(embed_tokens): Embedding(151936, 896)
|
40 |
+
(layers): ModuleList(
|
41 |
+
(0-23): 24 x FullyShardedDataParallel(
|
42 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
43 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
44 |
+
(self_attn): Qwen2FlashAttention2(
|
45 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
46 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
47 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
48 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
49 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
50 |
+
)
|
51 |
+
(mlp): Qwen2MLP(
|
52 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
53 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
54 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
55 |
+
(act_fn): SiLU()
|
56 |
+
)
|
57 |
+
(input_layernorm): Qwen2RMSNorm()
|
58 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
59 |
+
)
|
60 |
+
)
|
61 |
+
)
|
62 |
+
)
|
63 |
+
(norm): Qwen2RMSNorm()
|
64 |
+
)
|
65 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
66 |
+
)
|
67 |
+
)
|
68 |
+
model config: Qwen2Config {
|
69 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
70 |
+
"architectures": [
|
71 |
+
"Qwen2ForCausalLM"
|
72 |
+
],
|
73 |
+
"attention_dropout": 0.0,
|
74 |
+
"bos_token_id": 151643,
|
75 |
+
"eos_token_id": 151643,
|
76 |
+
"hidden_act": "silu",
|
77 |
+
"hidden_size": 896,
|
78 |
+
"initializer_range": 0.02,
|
79 |
+
"intermediate_size": 4864,
|
80 |
+
"label_smoothing": 0.0,
|
81 |
+
"max_position_embeddings": 4096,
|
82 |
+
"max_window_layers": 24,
|
83 |
+
"model_type": "qwen2",
|
84 |
+
"num_attention_heads": 14,
|
85 |
+
"num_hidden_layers": 24,
|
86 |
+
"num_key_value_heads": 2,
|
87 |
+
"rms_norm_eps": 1e-06,
|
88 |
+
"rope_theta": 1000000.0,
|
89 |
+
"sliding_window": null,
|
90 |
+
"tie_word_embeddings": true,
|
91 |
+
"torch_dtype": "bfloat16",
|
92 |
+
"transformers_version": "4.43.3",
|
93 |
+
"use_cache": false,
|
94 |
+
"use_sliding_window": false,
|
95 |
+
"vocab_size": 151936
|
96 |
+
}
|
97 |
+
------------------------------------------------------------------
|
98 |
+
iteration: 1 , TFLOPS: 69.43623917184445, Tokens per sec: 17268.44384112612, Loss: 4.1814446449279785
|
99 |
+
------------------------------------------------------------------
|
100 |
+
------------------------------------------------------------------
|
101 |
+
iteration: 2 , TFLOPS: 69.64205785663373, Tokens per sec: 17319.629914020166, Loss: 4.191491603851318
|
102 |
+
------------------------------------------------------------------
|
103 |
+
------------------------------------------------------------------
|
104 |
+
iteration: 3 , TFLOPS: 69.60094665048808, Tokens per sec: 17309.405763590446, Loss: 4.197597026824951
|
105 |
+
------------------------------------------------------------------
|
106 |
+
------------------------------------------------------------------
|
107 |
+
iteration: 4 , TFLOPS: 69.47512522949748, Tokens per sec: 17278.114608304662, Loss: 4.183670520782471
|
108 |
+
------------------------------------------------------------------
|
109 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
110 |
+
warnings.warn(
|
111 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
112 |
+
warnings.warn(
|
113 |
+
------------------------------------------------------------------
|
114 |
+
iteration: 5 , TFLOPS: 69.67467547447801, Tokens per sec: 17327.7417517103, Loss: 4.198245048522949
|
115 |
+
------------------------------------------------------------------
|
116 |
+
Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
|
117 |
+
Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
|
118 |
+
Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
|
119 |
+
[rank0]:[2024-08-12 07:11:16,345] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006517466999866883, 'preprocessing_with_comm': 0.0007555539996246807, 'state_converting': 0.9849483990001318, <Type.ALL: 'all'>: 0.9936859660001574})
|
120 |
+
Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
|
121 |
+
Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
|
122 |
+
Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
|
123 |
+
Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
|
124 |
+
Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
|
125 |
+
Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
|
126 |
+
Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005, took 4.44s
|
127 |
+
------------------------------------------------------------------
|
128 |
+
iteration: 6 , TFLOPS: 70.22008480550622, Tokens per sec: 17463.382312253587, Loss: 4.179391860961914
|
129 |
+
------------------------------------------------------------------
|
130 |
+
------------------------------------------------------------------
|
131 |
+
iteration: 7 , TFLOPS: 69.98955682269778, Tokens per sec: 17406.051161079293, Loss: 4.190949440002441
|
132 |
+
------------------------------------------------------------------
|
133 |
+
------------------------------------------------------------------
|
134 |
+
iteration: 8 , TFLOPS: 69.94509258955091, Tokens per sec: 17394.993129679646, Loss: 4.189082622528076
|
135 |
+
------------------------------------------------------------------
|
136 |
+
------------------------------------------------------------------
|
137 |
+
iteration: 9 , TFLOPS: 70.07602036768274, Tokens per sec: 17427.55421033261, Loss: 4.181089878082275
|
138 |
+
------------------------------------------------------------------
|
139 |
+
------------------------------------------------------------------
|
140 |
+
iteration: 10 , TFLOPS: 70.03395601975187, Tokens per sec: 17417.093018329397, Loss: 4.1603803634643555
|
141 |
+
------------------------------------------------------------------
|
142 |
+
Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010
|
143 |
+
Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
|
144 |
+
Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
|
145 |
+
[rank0]:[2024-08-12 07:17:37,283] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0064329239994549425, 'preprocessing_with_comm': 0.0007190309997895383, 'state_converting': 0.9757228209991808, <Type.ALL: 'all'>: 0.9842789310005173})
|
146 |
+
Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
|
147 |
+
Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
|
148 |
+
Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
|
149 |
+
Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
|
150 |
+
Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
|
151 |
+
Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
|
152 |
+
Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010, took 4.48s
|
153 |
+
------------------------------------------------------------------
|
154 |
+
iteration: 11 , TFLOPS: 70.31766010694388, Tokens per sec: 17487.64879951231, Loss: 4.118324279785156
|
155 |
+
------------------------------------------------------------------
|
156 |
+
------------------------------------------------------------------
|
157 |
+
iteration: 12 , TFLOPS: 70.37958976318761, Tokens per sec: 17503.050393891557, Loss: 4.171144008636475
|
158 |
+
------------------------------------------------------------------
|
wandb/run-20240812_070449-ufge4h1y/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T22:04:49.754332",
|
5 |
+
"startedAt": "2024-08-11T22:04:49.102690",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--train-data-path",
|
24 |
+
"304771887",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"304771887",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"304771887",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"5",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-qwen2-0.5B",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-qwen2-0.5B_train_2024-08-12-07:04:37"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 4.171144008636475, "training/perplexity": 64.78952950804121, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 12, "optimizer/lr": 1.4560000000000001e-06, "optimizer/variance_l2": 0.012989128226478895, "optimizer/variance_sqrt_l2": 0.6784465027663834, "optimizer/momentum_l2": 0.7107880089338467, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.4604034423828125, "optimizer/variance_sqrt_l1": 2849.0, "optimizer/momentum_l1": 2785.25, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00909423828125, "optimizer/variance_sqrt_abs_max": 0.09521484375, "optimizer/momentum_abs_max": 0.10107421875, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.9035151299995, "stats/tokens_per_sec": 17503.050393891557, "stats/tokens_per_sec_per_gpu": 17503.050393891557, "stats/tflops": 70.37958976318761, "_timestamp": 1723414808.909133, "_runtime": 919.7931730747223, "_step": 12, "_wandb": {"runtime": 922}}
|
wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log
ADDED
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 07:04:49,117 INFO StreamThr :13762 [internal.py:wandb_internal():86] W&B internal server running at pid: 13762, started at: 2024-08-12 07:04:49.116639
|
2 |
+
2024-08-12 07:04:49,119 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-12 07:04:49,121 INFO WriterThread:13762 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
|
4 |
+
2024-08-12 07:04:49,122 DEBUG SenderThread:13762 [sender.py:send():382] send: header
|
5 |
+
2024-08-12 07:04:49,136 DEBUG SenderThread:13762 [sender.py:send():382] send: run
|
6 |
+
2024-08-12 07:04:49,638 INFO SenderThread:13762 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_070449-ufge4h1y/files
|
7 |
+
2024-08-12 07:04:49,638 INFO SenderThread:13762 [sender.py:_start_run_threads():1136] run started: ufge4h1y with start time 1723413889.11596
|
8 |
+
2024-08-12 07:04:49,643 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-12 07:04:49,643 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-12 07:04:49,733 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-12 07:04:49,742 INFO SystemMonitor:13762 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-12 07:04:49,744 INFO SystemMonitor:13762 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-12 07:04:49,754 DEBUG HandlerThread:13762 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-12 07:04:49,756 DEBUG HandlerThread:13762 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-12 07:04:49,770 DEBUG HandlerThread:13762 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:04:49.754332', 'startedAt': '2024-08-11T22:04:49.102690', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:04:37'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
26 |
+
2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-12 07:04:49,772 INFO HandlerThread:13762 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-12 07:04:49,779 DEBUG SenderThread:13762 [sender.py:send():382] send: files
|
30 |
+
2024-08-12 07:04:49,779 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-12 07:04:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-12 07:04:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-12 07:04:49,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-12 07:04:50,088 DEBUG SenderThread:13762 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-12 07:04:50,465 INFO wandb-upload_0:13762 [upload_job.py:push():131] Uploaded file /tmp/tmp0h3j51sdwandb/z7nk28zc-wandb-metadata.json
|
38 |
+
2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json
|
39 |
+
2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
40 |
+
2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt
|
41 |
+
2024-08-12 07:04:52,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
42 |
+
2024-08-12 07:04:54,468 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-12 07:04:54,641 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
44 |
+
2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
|
45 |
+
2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
|
46 |
+
2024-08-12 07:04:56,643 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
47 |
+
2024-08-12 07:04:59,720 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
48 |
+
2024-08-12 07:05:04,721 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
49 |
+
2024-08-12 07:05:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
50 |
+
2024-08-12 07:05:04,790 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
51 |
+
2024-08-12 07:05:04,790 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
52 |
+
2024-08-12 07:05:10,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
53 |
+
2024-08-12 07:05:15,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
54 |
+
2024-08-12 07:05:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
55 |
+
2024-08-12 07:05:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
56 |
+
2024-08-12 07:05:19,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
57 |
+
2024-08-12 07:05:20,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-12 07:05:20,658 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml
|
59 |
+
2024-08-12 07:05:25,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
60 |
+
2024-08-12 07:05:30,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
61 |
+
2024-08-12 07:05:34,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
62 |
+
2024-08-12 07:05:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
63 |
+
2024-08-12 07:05:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
64 |
+
2024-08-12 07:05:36,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
65 |
+
2024-08-12 07:05:41,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
66 |
+
2024-08-12 07:05:46,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
67 |
+
2024-08-12 07:05:49,744 DEBUG SystemMonitor:13762 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
68 |
+
2024-08-12 07:05:49,746 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
69 |
+
2024-08-12 07:05:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
70 |
+
2024-08-12 07:05:49,788 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
71 |
+
2024-08-12 07:05:49,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
72 |
+
2024-08-12 07:05:51,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
73 |
+
2024-08-12 07:05:56,987 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
74 |
+
2024-08-12 07:06:01,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
75 |
+
2024-08-12 07:06:04,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
76 |
+
2024-08-12 07:06:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
77 |
+
2024-08-12 07:06:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
78 |
+
2024-08-12 07:06:06,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
79 |
+
2024-08-12 07:06:10,837 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
80 |
+
2024-08-12 07:06:12,691 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
81 |
+
2024-08-12 07:06:12,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
82 |
+
2024-08-12 07:06:17,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
83 |
+
2024-08-12 07:06:19,747 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
84 |
+
2024-08-12 07:06:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
85 |
+
2024-08-12 07:06:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
86 |
+
2024-08-12 07:06:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
87 |
+
2024-08-12 07:06:23,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
88 |
+
2024-08-12 07:06:28,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
89 |
+
2024-08-12 07:06:33,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
90 |
+
2024-08-12 07:06:34,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
91 |
+
2024-08-12 07:06:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
92 |
+
2024-08-12 07:06:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
93 |
+
2024-08-12 07:06:39,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-12 07:06:44,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-08-12 07:06:49,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
96 |
+
2024-08-12 07:06:49,748 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
97 |
+
2024-08-12 07:06:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
98 |
+
2024-08-12 07:06:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
99 |
+
2024-08-12 07:06:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
100 |
+
2024-08-12 07:06:54,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
101 |
+
2024-08-12 07:06:59,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-12 07:07:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
103 |
+
2024-08-12 07:07:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
104 |
+
2024-08-12 07:07:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
105 |
+
2024-08-12 07:07:05,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
106 |
+
2024-08-12 07:07:10,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
107 |
+
2024-08-12 07:07:15,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
108 |
+
2024-08-12 07:07:19,749 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
109 |
+
2024-08-12 07:07:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
110 |
+
2024-08-12 07:07:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
111 |
+
2024-08-12 07:07:19,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
112 |
+
2024-08-12 07:07:20,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
113 |
+
2024-08-12 07:07:25,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
114 |
+
2024-08-12 07:07:26,535 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
115 |
+
2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
116 |
+
2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
117 |
+
2024-08-12 07:07:26,540 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
118 |
+
2024-08-12 07:07:26,739 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
119 |
+
2024-08-12 07:07:28,741 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
120 |
+
2024-08-12 07:07:31,578 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
121 |
+
2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
122 |
+
2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
123 |
+
2024-08-12 07:07:34,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
124 |
+
2024-08-12 07:07:37,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
125 |
+
2024-08-12 07:07:42,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
126 |
+
2024-08-12 07:07:47,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
127 |
+
2024-08-12 07:07:49,750 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
128 |
+
2024-08-12 07:07:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
129 |
+
2024-08-12 07:07:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
130 |
+
2024-08-12 07:07:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
131 |
+
2024-08-12 07:07:52,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
132 |
+
2024-08-12 07:07:57,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
133 |
+
2024-08-12 07:08:02,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
134 |
+
2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
135 |
+
2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
136 |
+
2024-08-12 07:08:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
137 |
+
2024-08-12 07:08:08,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
138 |
+
2024-08-12 07:08:13,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
139 |
+
2024-08-12 07:08:18,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
140 |
+
2024-08-12 07:08:19,751 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
141 |
+
2024-08-12 07:08:19,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
142 |
+
2024-08-12 07:08:19,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
143 |
+
2024-08-12 07:08:19,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
144 |
+
2024-08-12 07:08:23,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
145 |
+
2024-08-12 07:08:28,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
146 |
+
2024-08-12 07:08:33,991 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
147 |
+
2024-08-12 07:08:34,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
148 |
+
2024-08-12 07:08:34,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
149 |
+
2024-08-12 07:08:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
150 |
+
2024-08-12 07:08:39,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
151 |
+
2024-08-12 07:08:42,279 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
152 |
+
2024-08-12 07:08:42,281 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
153 |
+
2024-08-12 07:08:42,282 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
154 |
+
2024-08-12 07:08:42,283 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
155 |
+
2024-08-12 07:08:42,792 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
156 |
+
2024-08-12 07:08:44,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
157 |
+
2024-08-12 07:08:44,793 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
158 |
+
2024-08-12 07:08:49,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
159 |
+
2024-08-12 07:08:49,752 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
160 |
+
2024-08-12 07:08:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
161 |
+
2024-08-12 07:08:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
162 |
+
2024-08-12 07:08:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
163 |
+
2024-08-12 07:08:54,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
164 |
+
2024-08-12 07:08:59,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
165 |
+
2024-08-12 07:09:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
166 |
+
2024-08-12 07:09:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
167 |
+
2024-08-12 07:09:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
168 |
+
2024-08-12 07:09:05,001 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
169 |
+
2024-08-12 07:09:10,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
170 |
+
2024-08-12 07:09:15,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
171 |
+
2024-08-12 07:09:19,753 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
172 |
+
2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
173 |
+
2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
174 |
+
2024-08-12 07:09:19,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
175 |
+
2024-08-12 07:09:20,044 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
176 |
+
2024-08-12 07:09:25,045 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
177 |
+
2024-08-12 07:09:30,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
178 |
+
2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
179 |
+
2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
180 |
+
2024-08-12 07:09:34,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
181 |
+
2024-08-12 07:09:35,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
182 |
+
2024-08-12 07:09:40,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
183 |
+
2024-08-12 07:09:45,996 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
184 |
+
2024-08-12 07:09:49,754 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
185 |
+
2024-08-12 07:09:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
186 |
+
2024-08-12 07:09:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
187 |
+
2024-08-12 07:09:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
188 |
+
2024-08-12 07:09:51,979 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
189 |
+
2024-08-12 07:09:56,980 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
190 |
+
2024-08-12 07:09:58,160 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
191 |
+
2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
192 |
+
2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
193 |
+
2024-08-12 07:09:58,163 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
194 |
+
2024-08-12 07:09:58,845 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
195 |
+
2024-08-12 07:10:00,846 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
196 |
+
2024-08-12 07:10:02,202 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
197 |
+
2024-08-12 07:10:04,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
198 |
+
2024-08-12 07:10:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
199 |
+
2024-08-12 07:10:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
200 |
+
2024-08-12 07:10:08,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
201 |
+
2024-08-12 07:10:13,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
202 |
+
2024-08-12 07:10:18,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
203 |
+
2024-08-12 07:10:19,755 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
204 |
+
2024-08-12 07:10:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
205 |
+
2024-08-12 07:10:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
206 |
+
2024-08-12 07:10:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
207 |
+
2024-08-12 07:10:23,070 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
208 |
+
2024-08-12 07:10:28,071 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
209 |
+
2024-08-12 07:10:33,072 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
210 |
+
2024-08-12 07:10:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
211 |
+
2024-08-12 07:10:34,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
212 |
+
2024-08-12 07:10:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
213 |
+
2024-08-12 07:10:38,976 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
214 |
+
2024-08-12 07:10:43,977 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
215 |
+
2024-08-12 07:10:48,978 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
216 |
+
2024-08-12 07:10:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
217 |
+
2024-08-12 07:10:49,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
218 |
+
2024-08-12 07:10:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
219 |
+
2024-08-12 07:10:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
220 |
+
2024-08-12 07:10:54,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
221 |
+
2024-08-12 07:10:59,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
222 |
+
2024-08-12 07:11:04,012 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
223 |
+
2024-08-12 07:11:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
224 |
+
2024-08-12 07:11:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
225 |
+
2024-08-12 07:11:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
226 |
+
2024-08-12 07:11:09,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
227 |
+
2024-08-12 07:11:13,824 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
228 |
+
2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
229 |
+
2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
230 |
+
2024-08-12 07:11:13,827 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
231 |
+
2024-08-12 07:11:13,896 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
232 |
+
2024-08-12 07:11:14,866 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
233 |
+
2024-08-12 07:11:14,897 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
234 |
+
2024-08-12 07:11:16,898 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
235 |
+
2024-08-12 07:11:18,900 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
236 |
+
2024-08-12 07:11:19,757 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
237 |
+
2024-08-12 07:11:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
238 |
+
2024-08-12 07:11:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
239 |
+
2024-08-12 07:11:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
240 |
+
2024-08-12 07:11:20,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
241 |
+
2024-08-12 07:11:20,901 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
242 |
+
2024-08-12 07:11:25,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
243 |
+
2024-08-12 07:11:30,005 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
244 |
+
2024-08-12 07:11:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
245 |
+
2024-08-12 07:11:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
246 |
+
2024-08-12 07:11:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
247 |
+
2024-08-12 07:11:35,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
248 |
+
2024-08-12 07:11:40,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
249 |
+
2024-08-12 07:11:45,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
250 |
+
2024-08-12 07:11:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
251 |
+
2024-08-12 07:11:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
252 |
+
2024-08-12 07:11:49,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
253 |
+
2024-08-12 07:11:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
254 |
+
2024-08-12 07:11:51,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
255 |
+
2024-08-12 07:11:56,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
256 |
+
2024-08-12 07:12:01,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
257 |
+
2024-08-12 07:12:04,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
258 |
+
2024-08-12 07:12:04,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
259 |
+
2024-08-12 07:12:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
260 |
+
2024-08-12 07:12:06,998 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
261 |
+
2024-08-12 07:12:11,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
262 |
+
2024-08-12 07:12:17,000 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
263 |
+
2024-08-12 07:12:19,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
264 |
+
2024-08-12 07:12:19,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
265 |
+
2024-08-12 07:12:19,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
266 |
+
2024-08-12 07:12:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
267 |
+
2024-08-12 07:12:22,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
268 |
+
2024-08-12 07:12:27,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
269 |
+
2024-08-12 07:12:32,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
270 |
+
2024-08-12 07:12:33,344 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
271 |
+
2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
272 |
+
2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
273 |
+
2024-08-12 07:12:33,348 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
274 |
+
2024-08-12 07:12:33,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
275 |
+
2024-08-12 07:12:34,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
276 |
+
2024-08-12 07:12:34,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
277 |
+
2024-08-12 07:12:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
278 |
+
2024-08-12 07:12:34,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
279 |
+
2024-08-12 07:12:38,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
280 |
+
2024-08-12 07:12:43,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
281 |
+
2024-08-12 07:12:48,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
282 |
+
2024-08-12 07:12:49,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
283 |
+
2024-08-12 07:12:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
284 |
+
2024-08-12 07:12:49,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
285 |
+
2024-08-12 07:12:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
286 |
+
2024-08-12 07:12:53,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
287 |
+
2024-08-12 07:12:58,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
288 |
+
2024-08-12 07:13:03,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
289 |
+
2024-08-12 07:13:04,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
290 |
+
2024-08-12 07:13:04,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
291 |
+
2024-08-12 07:13:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
292 |
+
2024-08-12 07:13:09,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
293 |
+
2024-08-12 07:13:14,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
294 |
+
2024-08-12 07:13:19,034 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
295 |
+
2024-08-12 07:13:19,761 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
296 |
+
2024-08-12 07:13:19,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
297 |
+
2024-08-12 07:13:19,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
298 |
+
2024-08-12 07:13:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
299 |
+
2024-08-12 07:13:25,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
300 |
+
2024-08-12 07:13:30,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
301 |
+
2024-08-12 07:13:34,838 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
302 |
+
2024-08-12 07:13:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
303 |
+
2024-08-12 07:13:34,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
304 |
+
2024-08-12 07:13:35,055 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
305 |
+
2024-08-12 07:13:40,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
306 |
+
2024-08-12 07:13:45,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
307 |
+
2024-08-12 07:13:48,668 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
308 |
+
2024-08-12 07:13:48,670 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
309 |
+
2024-08-12 07:13:48,671 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
310 |
+
2024-08-12 07:13:48,672 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
311 |
+
2024-08-12 07:13:48,997 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
312 |
+
2024-08-12 07:13:49,762 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
313 |
+
2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
314 |
+
2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
315 |
+
2024-08-12 07:13:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
316 |
+
2024-08-12 07:13:50,998 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
317 |
+
2024-08-12 07:13:51,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
318 |
+
2024-08-12 07:13:56,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
319 |
+
2024-08-12 07:14:01,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
320 |
+
2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
321 |
+
2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
322 |
+
2024-08-12 07:14:04,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
323 |
+
2024-08-12 07:14:06,089 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
324 |
+
2024-08-12 07:14:11,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
325 |
+
2024-08-12 07:14:16,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
326 |
+
2024-08-12 07:14:19,763 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
327 |
+
2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
328 |
+
2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
329 |
+
2024-08-12 07:14:19,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
330 |
+
2024-08-12 07:14:21,108 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
331 |
+
2024-08-12 07:14:26,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
332 |
+
2024-08-12 07:14:31,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
333 |
+
2024-08-12 07:14:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
334 |
+
2024-08-12 07:14:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
335 |
+
2024-08-12 07:14:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
336 |
+
2024-08-12 07:14:37,031 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
337 |
+
2024-08-12 07:14:42,032 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
338 |
+
2024-08-12 07:14:47,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
339 |
+
2024-08-12 07:14:49,764 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
340 |
+
2024-08-12 07:14:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
341 |
+
2024-08-12 07:14:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
342 |
+
2024-08-12 07:14:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
343 |
+
2024-08-12 07:14:52,060 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
344 |
+
2024-08-12 07:14:57,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
345 |
+
2024-08-12 07:15:02,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
346 |
+
2024-08-12 07:15:04,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
347 |
+
2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
348 |
+
2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
349 |
+
2024-08-12 07:15:04,043 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
350 |
+
2024-08-12 07:15:04,047 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
351 |
+
2024-08-12 07:15:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
352 |
+
2024-08-12 07:15:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
353 |
+
2024-08-12 07:15:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
354 |
+
2024-08-12 07:15:05,048 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
355 |
+
2024-08-12 07:15:07,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
356 |
+
2024-08-12 07:15:12,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
357 |
+
2024-08-12 07:15:17,078 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
358 |
+
2024-08-12 07:15:19,765 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
359 |
+
2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
360 |
+
2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
361 |
+
2024-08-12 07:15:19,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
362 |
+
2024-08-12 07:15:22,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
363 |
+
2024-08-12 07:15:27,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
364 |
+
2024-08-12 07:15:32,082 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
365 |
+
2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
366 |
+
2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
367 |
+
2024-08-12 07:15:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
368 |
+
2024-08-12 07:15:38,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
369 |
+
2024-08-12 07:15:43,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
370 |
+
2024-08-12 07:15:48,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
371 |
+
2024-08-12 07:15:49,766 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
372 |
+
2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
373 |
+
2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
374 |
+
2024-08-12 07:15:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
375 |
+
2024-08-12 07:15:53,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
376 |
+
2024-08-12 07:15:58,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
377 |
+
2024-08-12 07:16:03,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
378 |
+
2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
379 |
+
2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
380 |
+
2024-08-12 07:16:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
381 |
+
2024-08-12 07:16:09,051 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
382 |
+
2024-08-12 07:16:14,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
383 |
+
2024-08-12 07:16:19,053 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
384 |
+
2024-08-12 07:16:19,269 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
385 |
+
2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
386 |
+
2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
387 |
+
2024-08-12 07:16:19,273 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
388 |
+
2024-08-12 07:16:19,767 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
389 |
+
2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
390 |
+
2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
391 |
+
2024-08-12 07:16:19,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
392 |
+
2024-08-12 07:16:20,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
393 |
+
2024-08-12 07:16:21,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
394 |
+
2024-08-12 07:16:25,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
395 |
+
2024-08-12 07:16:30,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
396 |
+
2024-08-12 07:16:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
397 |
+
2024-08-12 07:16:34,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
398 |
+
2024-08-12 07:16:34,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
399 |
+
2024-08-12 07:16:35,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
400 |
+
2024-08-12 07:16:40,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
401 |
+
2024-08-12 07:16:45,101 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
402 |
+
2024-08-12 07:16:49,768 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
403 |
+
2024-08-12 07:16:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
404 |
+
2024-08-12 07:16:49,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
405 |
+
2024-08-12 07:16:49,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
406 |
+
2024-08-12 07:16:51,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
407 |
+
2024-08-12 07:16:56,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
408 |
+
2024-08-12 07:17:01,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
409 |
+
2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
410 |
+
2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
411 |
+
2024-08-12 07:17:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
412 |
+
2024-08-12 07:17:06,122 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
413 |
+
2024-08-12 07:17:11,123 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
414 |
+
2024-08-12 07:17:16,124 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
415 |
+
2024-08-12 07:17:19,769 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
416 |
+
2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
417 |
+
2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
418 |
+
2024-08-12 07:17:19,842 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
419 |
+
2024-08-12 07:17:22,020 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
420 |
+
2024-08-12 07:17:27,021 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
421 |
+
2024-08-12 07:17:32,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
422 |
+
2024-08-12 07:17:34,545 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
423 |
+
2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
424 |
+
2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
425 |
+
2024-08-12 07:17:34,550 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
426 |
+
2024-08-12 07:17:35,013 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
427 |
+
2024-08-12 07:17:35,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
428 |
+
2024-08-12 07:17:35,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
429 |
+
2024-08-12 07:17:35,149 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
430 |
+
2024-08-12 07:17:37,151 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
431 |
+
2024-08-12 07:17:37,272 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
432 |
+
2024-08-12 07:17:39,152 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
433 |
+
2024-08-12 07:17:41,154 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
434 |
+
2024-08-12 07:17:43,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
435 |
+
2024-08-12 07:17:48,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
436 |
+
2024-08-12 07:17:49,770 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
437 |
+
2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
438 |
+
2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
439 |
+
2024-08-12 07:17:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
440 |
+
2024-08-12 07:17:53,197 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
441 |
+
2024-08-12 07:17:58,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
442 |
+
2024-08-12 07:18:03,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
443 |
+
2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
444 |
+
2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
445 |
+
2024-08-12 07:18:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
446 |
+
2024-08-12 07:18:08,232 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
447 |
+
2024-08-12 07:18:13,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
448 |
+
2024-08-12 07:18:18,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
449 |
+
2024-08-12 07:18:19,771 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
450 |
+
2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
451 |
+
2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
452 |
+
2024-08-12 07:18:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
453 |
+
2024-08-12 07:18:23,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
454 |
+
2024-08-12 07:18:28,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
455 |
+
2024-08-12 07:18:33,238 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
456 |
+
2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
457 |
+
2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
458 |
+
2024-08-12 07:18:34,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
459 |
+
2024-08-12 07:18:39,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
460 |
+
2024-08-12 07:18:44,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
461 |
+
2024-08-12 07:18:49,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
462 |
+
2024-08-12 07:18:49,772 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
463 |
+
2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
464 |
+
2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
465 |
+
2024-08-12 07:18:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
466 |
+
2024-08-12 07:18:54,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
467 |
+
2024-08-12 07:18:54,006 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
468 |
+
2024-08-12 07:18:54,007 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
469 |
+
2024-08-12 07:18:54,008 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
470 |
+
2024-08-12 07:18:54,198 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
471 |
+
2024-08-12 07:18:55,009 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
472 |
+
2024-08-12 07:18:55,199 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
473 |
+
2024-08-12 07:19:00,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
474 |
+
2024-08-12 07:19:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
475 |
+
2024-08-12 07:19:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
476 |
+
2024-08-12 07:19:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
477 |
+
2024-08-12 07:19:05,244 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
478 |
+
2024-08-12 07:19:10,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
479 |
+
2024-08-12 07:19:15,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
480 |
+
2024-08-12 07:19:19,773 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
481 |
+
2024-08-12 07:19:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
482 |
+
2024-08-12 07:19:19,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
483 |
+
2024-08-12 07:19:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
484 |
+
2024-08-12 07:19:21,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
485 |
+
2024-08-12 07:19:26,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
486 |
+
2024-08-12 07:19:31,169 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
487 |
+
2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
488 |
+
2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
489 |
+
2024-08-12 07:19:34,972 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
490 |
+
2024-08-12 07:19:37,149 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
491 |
+
2024-08-12 07:19:42,150 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
492 |
+
2024-08-12 07:19:47,151 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
493 |
+
2024-08-12 07:19:49,774 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
494 |
+
2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
495 |
+
2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
496 |
+
2024-08-12 07:19:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
497 |
+
2024-08-12 07:19:52,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
498 |
+
2024-08-12 07:19:57,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
499 |
+
2024-08-12 07:20:02,231 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
500 |
+
2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
|
501 |
+
2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
|
502 |
+
2024-08-12 07:20:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
|
503 |
+
2024-08-12 07:20:08,210 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
504 |
+
2024-08-12 07:20:08,910 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
|
505 |
+
2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
506 |
+
2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
507 |
+
2024-08-12 07:20:08,914 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
508 |
+
2024-08-12 07:20:09,243 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
509 |
+
2024-08-12 07:20:09,244 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
510 |
+
2024-08-12 07:20:12,332 DEBUG SenderThread:13762 [sender.py:send():382] send: exit
|
511 |
+
2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():589] handling exit code: 255
|
512 |
+
2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():591] handling runtime: 922
|
513 |
+
2024-08-12 07:20:12,333 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
514 |
+
2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_exit():597] send defer
|
515 |
+
2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
516 |
+
2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 0
|
517 |
+
2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
518 |
+
2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 0
|
519 |
+
2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 1
|
520 |
+
2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
521 |
+
2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 1
|
522 |
+
2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
523 |
+
2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 1
|
524 |
+
2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 2
|
525 |
+
2024-08-12 07:20:12,335 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
526 |
+
2024-08-12 07:20:12,335 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 2
|
527 |
+
2024-08-12 07:20:12,335 INFO HandlerThread:13762 [system_monitor.py:finish():203] Stopping system monitor
|
528 |
+
2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
529 |
+
2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined cpu monitor
|
530 |
+
2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():183] Publishing last batch of metrics
|
531 |
+
2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined disk monitor
|
532 |
+
2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined gpu monitor
|
533 |
+
2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined memory monitor
|
534 |
+
2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined network monitor
|
535 |
+
2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
536 |
+
2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 2
|
537 |
+
2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 3
|
538 |
+
2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
|
539 |
+
2024-08-12 07:20:12,372 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
540 |
+
2024-08-12 07:20:12,373 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 3
|
541 |
+
2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send():382] send: history
|
542 |
+
2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
|
543 |
+
2024-08-12 07:20:12,375 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
544 |
+
2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
545 |
+
2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 3
|
546 |
+
2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 4
|
547 |
+
2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
548 |
+
2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 4
|
549 |
+
2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
550 |
+
2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 4
|
551 |
+
2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 5
|
552 |
+
2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
553 |
+
2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 5
|
554 |
+
2024-08-12 07:20:12,377 DEBUG SenderThread:13762 [sender.py:send():382] send: summary
|
555 |
+
2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
556 |
+
2024-08-12 07:20:12,378 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
557 |
+
2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 5
|
558 |
+
2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 6
|
559 |
+
2024-08-12 07:20:12,378 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
560 |
+
2024-08-12 07:20:12,378 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 6
|
561 |
+
2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
562 |
+
2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 6
|
563 |
+
2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 7
|
564 |
+
2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
565 |
+
2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
566 |
+
2024-08-12 07:20:12,379 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 7
|
567 |
+
2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
568 |
+
2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 7
|
569 |
+
2024-08-12 07:20:13,247 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
|
570 |
+
2024-08-12 07:20:13,332 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
|
571 |
+
2024-08-12 07:20:15,017 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 8
|
572 |
+
2024-08-12 07:20:15,017 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
|
573 |
+
2024-08-12 07:20:15,017 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
574 |
+
2024-08-12 07:20:15,017 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 8
|
575 |
+
2024-08-12 07:20:15,018 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
576 |
+
2024-08-12 07:20:15,018 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 8
|
577 |
+
2024-08-12 07:20:15,018 INFO SenderThread:13762 [job_builder.py:build():296] Attempting to build job artifact
|
578 |
+
2024-08-12 07:20:15,019 INFO SenderThread:13762 [job_builder.py:_get_source_type():426] is repo sourced job
|
579 |
+
2024-08-12 07:20:15,033 INFO SenderThread:13762 [job_builder.py:build():402] adding wandb-job metadata file
|
580 |
+
2024-08-12 07:20:15,042 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 9
|
581 |
+
2024-08-12 07:20:15,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
582 |
+
2024-08-12 07:20:15,042 DEBUG SenderThread:13762 [sender.py:send():382] send: artifact
|
583 |
+
2024-08-12 07:20:15,042 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 9
|
584 |
+
2024-08-12 07:20:15,248 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
|
585 |
+
2024-08-12 07:20:15,333 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
|
586 |
+
2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
|
587 |
+
2024-08-12 07:20:15,953 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
588 |
+
2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 9
|
589 |
+
2024-08-12 07:20:15,953 INFO SenderThread:13762 [dir_watcher.py:finish():358] shutting down directory watcher
|
590 |
+
2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_070449-ufge4h1y/files
|
591 |
+
2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt requirements.txt
|
592 |
+
2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml config.yaml
|
593 |
+
2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json wandb-metadata.json
|
594 |
+
2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json wandb-summary.json
|
595 |
+
2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log output.log
|
596 |
+
2024-08-12 07:20:16,250 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 10
|
597 |
+
2024-08-12 07:20:16,250 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
|
598 |
+
2024-08-12 07:20:16,251 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
|
599 |
+
2024-08-12 07:20:16,251 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 10
|
600 |
+
2024-08-12 07:20:16,251 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
|
601 |
+
2024-08-12 07:20:16,251 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 10
|
602 |
+
2024-08-12 07:20:16,251 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
|
603 |
+
2024-08-12 07:20:20,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
604 |
+
2024-08-12 07:20:25,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
605 |
+
2024-08-12 07:20:30,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
606 |
+
2024-08-12 07:20:35,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
607 |
+
2024-08-12 07:20:40,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
|
608 |
+
2024-08-12 07:20:43,105 WARNING StreamThr :13762 [internal.py:is_dead():414] Internal process exiting, parent pid 13691 disappeared
|
609 |
+
2024-08-12 07:20:43,105 ERROR StreamThr :13762 [internal.py:wandb_internal():152] Internal process shutdown.
|
610 |
+
2024-08-12 07:20:43,255 INFO SenderThread:13762 [sender.py:finish():1572] shutting down sender
|
611 |
+
2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
|
612 |
+
2024-08-12 07:20:43,255 INFO HandlerThread:13762 [handler.py:finish():869] shutting down handler
|
613 |
+
2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:join():178] waiting for file pusher
|
614 |
+
2024-08-12 07:20:43,255 INFO WriterThread:13762 [datastore.py:close():296] close: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
|
615 |
+
2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_stream.py:finish():595] file stream finish called
|
616 |
+
2024-08-12 07:20:43,425 INFO SenderThread:13762 [file_stream.py:finish():599] file stream finish is done
|
wandb/run-20240812_070449-ufge4h1y/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 07:04:49,108 INFO MainThread:13691 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Configure stats pid to 13691
|
3 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
|
6 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug.log
|
9 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log
|
10 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:04:37', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-12 07:04:49,114 INFO MainThread:13691 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-12 07:04:49,115 INFO MainThread:13691 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-12 07:04:49,120 INFO MainThread:13691 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-12 07:04:49,131 INFO MainThread:13691 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-12 07:04:49,642 INFO MainThread:13691 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-12 07:04:49,789 INFO MainThread:13691 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
ADDED
Binary file (81.8 kB). View file
|
|
wandb/run-20240812_073202-yby212na/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '304771887'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '304771887'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '304771887'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-qwen2-0.5B_train_2024-08-12-07:31:51
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 5
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 1
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-qwen2-0.5B
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 151680
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723415522.366221
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Qwen2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: silu
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 896
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: qwen2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 4096
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 14
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 24
|
wandb/run-20240812_073202-yby212na/files/output.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
8 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
9 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
10 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
|
11 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
12 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
13 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
14 |
+
--> applying fsdp activation checkpointing...
|
15 |
+
> datasets target sizes (minimum size):
|
16 |
+
train: 6400000
|
17 |
+
validation: 12803200
|
18 |
+
test: 3200
|
19 |
+
> building train, validation, and test datasets for GPT ...
|
20 |
+
> finished creating GPT datasets ...
|
21 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
22 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
23 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
|
24 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
25 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
26 |
+
model info: FullyShardedDataParallel(
|
27 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
28 |
+
(model): Qwen2Model(
|
29 |
+
(embed_tokens): Embedding(151936, 896)
|
30 |
+
(layers): ModuleList(
|
31 |
+
(0-23): 24 x FullyShardedDataParallel(
|
32 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
33 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
34 |
+
(self_attn): Qwen2FlashAttention2(
|
35 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
36 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
37 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
38 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
39 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
40 |
+
)
|
41 |
+
(mlp): Qwen2MLP(
|
42 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
43 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
44 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
45 |
+
(act_fn): SiLU()
|
46 |
+
)
|
47 |
+
(input_layernorm): Qwen2RMSNorm()
|
48 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
49 |
+
)
|
50 |
+
)
|
51 |
+
)
|
52 |
+
)
|
53 |
+
(norm): Qwen2RMSNorm()
|
54 |
+
)
|
55 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
56 |
+
)
|
57 |
+
)
|
58 |
+
model config: Qwen2Config {
|
59 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
60 |
+
"architectures": [
|
61 |
+
"Qwen2ForCausalLM"
|
62 |
+
],
|
63 |
+
"attention_dropout": 0.0,
|
64 |
+
"bos_token_id": 151643,
|
65 |
+
"eos_token_id": 151643,
|
66 |
+
"hidden_act": "silu",
|
67 |
+
"hidden_size": 896,
|
68 |
+
"initializer_range": 0.02,
|
69 |
+
"intermediate_size": 4864,
|
70 |
+
"label_smoothing": 0.0,
|
71 |
+
"max_position_embeddings": 4096,
|
72 |
+
"max_window_layers": 24,
|
73 |
+
"model_type": "qwen2",
|
74 |
+
"num_attention_heads": 14,
|
75 |
+
"num_hidden_layers": 24,
|
76 |
+
"num_key_value_heads": 2,
|
77 |
+
"rms_norm_eps": 1e-06,
|
78 |
+
"rope_theta": 1000000.0,
|
79 |
+
"sliding_window": null,
|
80 |
+
"tie_word_embeddings": true,
|
81 |
+
"torch_dtype": "bfloat16",
|
82 |
+
"transformers_version": "4.43.3",
|
83 |
+
"use_cache": false,
|
84 |
+
"use_sliding_window": false,
|
85 |
+
"vocab_size": 151936
|
86 |
+
}
|
87 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
88 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
89 |
+
warnings.warn(
|
90 |
+
Let split = None
|
91 |
+
Building a BlendedDataset for a single MegatronDataset
|
92 |
+
Unable to save the indexes because path_to_cache is None
|
93 |
+
Building a BlendedDataset for a single MegatronDataset
|
94 |
+
Unable to save the indexes because path_to_cache is None
|
95 |
+
Building a BlendedDataset for a single MegatronDataset
|
96 |
+
Unable to save the indexes because path_to_cache is None
|
97 |
+
------------------------------------------------------------------
|
98 |
+
iteration: 1 , TFLOPS: 69.93553660778689, Tokens per sec: 17392.616605023257, Loss: 4.1814446449279785
|
99 |
+
------------------------------------------------------------------
|
100 |
+
Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001
|
101 |
+
Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
|
102 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
103 |
+
warnings.warn(
|
104 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
105 |
+
warnings.warn(
|
106 |
+
Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
|
107 |
+
Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
|
108 |
+
[rank0]:[2024-08-12 07:33:22,462] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006542664999869885, 'preprocessing_with_comm': 0.0007797380003466969, 'state_converting': 0.9963913259998662, <Type.ALL: 'all'>: 1.0051406040001893})
|
109 |
+
Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
|
110 |
+
Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
|
111 |
+
Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
|
112 |
+
Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
|
113 |
+
Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
|
114 |
+
None
|
115 |
+
/work/llm_recipes/models/yans-qwen2-0.5B/tokenizer
|
116 |
+
Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001, took 4.39s
|
wandb/run-20240812_073202-yby212na/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_073202-yby212na/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T22:32:03.032279",
|
5 |
+
"startedAt": "2024-08-11T22:32:02.353340",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--train-data-path",
|
24 |
+
"304771887",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"304771887",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"304771887",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"1",
|
56 |
+
"--eval-interval",
|
57 |
+
"5",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-qwen2-0.5B",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-qwen2-0.5B_train_2024-08-12-07:31:51"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240812_073202-yby212na/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 132}, "training/loss": 4.1814446449279785, "training/perplexity": 65.46035190441053, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1, "optimizer/lr": 1.038e-06, "optimizer/variance_l2": 0.001437161465185535, "optimizer/variance_sqrt_l2": 0.22307888709863474, "optimizer/momentum_l2": 0.09989735636562776, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.04984140396118164, "optimizer/variance_sqrt_l1": 889.25, "optimizer/momentum_l1": 397.875, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00101470947265625, "optimizer/variance_sqrt_abs_max": 0.03173828125, "optimizer/momentum_abs_max": 0.0142822265625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 75.37911228499979, "stats/tokens_per_sec": 17392.616605023257, "stats/tokens_per_sec_per_gpu": 17392.616605023257, "stats/tflops": 69.93553660778689, "_timestamp": 1723415599.9530108, "_runtime": 77.58678984642029, "_step": 1}
|
wandb/run-20240812_073202-yby212na/logs/debug-internal.log
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 07:32:02,368 INFO StreamThr :14458 [internal.py:wandb_internal():86] W&B internal server running at pid: 14458, started at: 2024-08-12 07:32:02.367023
|
2 |
+
2024-08-12 07:32:02,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-12 07:32:02,371 INFO WriterThread:14458 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
|
4 |
+
2024-08-12 07:32:02,372 DEBUG SenderThread:14458 [sender.py:send():382] send: header
|
5 |
+
2024-08-12 07:32:02,386 DEBUG SenderThread:14458 [sender.py:send():382] send: run
|
6 |
+
2024-08-12 07:32:02,917 INFO SenderThread:14458 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_073202-yby212na/files
|
7 |
+
2024-08-12 07:32:02,917 INFO SenderThread:14458 [sender.py:_start_run_threads():1136] run started: yby212na with start time 1723415522.366221
|
8 |
+
2024-08-12 07:32:02,923 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-12 07:32:02,923 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-12 07:32:03,012 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-12 07:32:03,018 INFO HandlerThread:14458 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-12 07:32:03,019 INFO HandlerThread:14458 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-12 07:32:03,021 INFO SystemMonitor:14458 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-12 07:32:03,022 INFO SystemMonitor:14458 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-12 07:32:03,032 DEBUG HandlerThread:14458 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-12 07:32:03,034 DEBUG HandlerThread:14458 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-12 07:32:03,046 DEBUG HandlerThread:14458 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:32:03.032279', 'startedAt': '2024-08-11T22:32:02.353340', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '1', '--eval-interval', '5', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:31:51'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
26 |
+
2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-12 07:32:03,048 INFO HandlerThread:14458 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-12 07:32:03,054 DEBUG SenderThread:14458 [sender.py:send():382] send: files
|
30 |
+
2024-08-12 07:32:03,054 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-12 07:32:03,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-12 07:32:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-12 07:32:03,067 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-12 07:32:03,383 DEBUG SenderThread:14458 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-12 07:32:03,716 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /tmp/tmpjkv15ab8wandb/lrd2pdzk-wandb-metadata.json
|
38 |
+
2024-08-12 07:32:03,919 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
|
39 |
+
2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
40 |
+
2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json
|
41 |
+
2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
|
42 |
+
2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
|
43 |
+
2024-08-12 07:32:05,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
44 |
+
2024-08-12 07:32:07,384 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-12 07:32:12,385 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-12 07:32:17,386 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
47 |
+
2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
48 |
+
2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
49 |
+
2024-08-12 07:32:18,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
50 |
+
2024-08-12 07:32:23,322 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-12 07:32:28,323 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
52 |
+
2024-08-12 07:32:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
53 |
+
2024-08-12 07:32:33,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
54 |
+
2024-08-12 07:32:33,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
55 |
+
2024-08-12 07:32:34,273 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
56 |
+
2024-08-12 07:32:34,938 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/config.yaml
|
57 |
+
2024-08-12 07:32:39,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-12 07:32:44,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
59 |
+
2024-08-12 07:32:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
60 |
+
2024-08-12 07:32:48,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
61 |
+
2024-08-12 07:32:48,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
62 |
+
2024-08-12 07:32:50,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-12 07:32:55,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-12 07:33:00,339 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
65 |
+
2024-08-12 07:33:03,022 DEBUG SystemMonitor:14458 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
66 |
+
2024-08-12 07:33:03,024 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
|
67 |
+
2024-08-12 07:33:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
68 |
+
2024-08-12 07:33:03,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
69 |
+
2024-08-12 07:33:03,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
70 |
+
2024-08-12 07:33:06,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
71 |
+
2024-08-12 07:33:11,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
72 |
+
2024-08-12 07:33:16,282 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
73 |
+
2024-08-12 07:33:18,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
74 |
+
2024-08-12 07:33:18,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
75 |
+
2024-08-12 07:33:18,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
76 |
+
2024-08-12 07:33:19,954 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: partial_history
|
77 |
+
2024-08-12 07:33:21,450 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
78 |
+
2024-08-12 07:33:21,967 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
79 |
+
2024-08-12 07:33:23,969 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
80 |
+
2024-08-12 07:33:25,970 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
81 |
+
2024-08-12 07:33:27,344 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
82 |
+
2024-08-12 07:33:32,345 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
83 |
+
2024-08-12 07:33:33,025 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
|
84 |
+
2024-08-12 07:33:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
85 |
+
2024-08-12 07:33:33,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
86 |
+
2024-08-12 07:33:33,066 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
87 |
+
2024-08-12 07:33:38,333 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
88 |
+
2024-08-12 07:33:43,334 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
89 |
+
2024-08-12 07:33:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
90 |
+
2024-08-12 07:33:48,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
91 |
+
2024-08-12 07:33:48,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
92 |
+
2024-08-12 07:33:49,288 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
93 |
+
2024-08-12 07:33:54,289 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-12 07:33:59,290 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-08-12 07:34:03,026 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
|
96 |
+
2024-08-12 07:34:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
|
97 |
+
2024-08-12 07:34:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
|
98 |
+
2024-08-12 07:34:03,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
|
99 |
+
2024-08-12 07:34:05,251 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
100 |
+
2024-08-12 07:34:10,252 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
101 |
+
2024-08-12 07:34:15,187 DEBUG SenderThread:14458 [sender.py:send():382] send: exit
|
102 |
+
2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():589] handling exit code: 255
|
103 |
+
2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():591] handling runtime: 132
|
104 |
+
2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
105 |
+
2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:send_exit():597] send defer
|
106 |
+
2024-08-12 07:34:15,189 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
107 |
+
2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 0
|
108 |
+
2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
109 |
+
2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 0
|
110 |
+
2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 1
|
111 |
+
2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
112 |
+
2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 1
|
113 |
+
2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
114 |
+
2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 1
|
115 |
+
2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 2
|
116 |
+
2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
117 |
+
2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 2
|
118 |
+
2024-08-12 07:34:15,190 INFO HandlerThread:14458 [system_monitor.py:finish():203] Stopping system monitor
|
119 |
+
2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
120 |
+
2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():183] Publishing last batch of metrics
|
121 |
+
2024-08-12 07:34:15,191 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined cpu monitor
|
122 |
+
2024-08-12 07:34:15,192 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined disk monitor
|
123 |
+
2024-08-12 07:34:15,225 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined gpu monitor
|
124 |
+
2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined memory monitor
|
125 |
+
2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined network monitor
|
126 |
+
2024-08-12 07:34:15,226 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
127 |
+
2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 2
|
128 |
+
2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 3
|
129 |
+
2024-08-12 07:34:15,227 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
|
130 |
+
2024-08-12 07:34:15,227 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
131 |
+
2024-08-12 07:34:15,227 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 3
|
132 |
+
2024-08-12 07:34:15,229 DEBUG SenderThread:14458 [sender.py:send():382] send: history
|
133 |
+
2024-08-12 07:34:15,230 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: summary_record
|
134 |
+
2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
135 |
+
2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
136 |
+
2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 3
|
137 |
+
2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 4
|
138 |
+
2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
139 |
+
2024-08-12 07:34:15,231 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 4
|
140 |
+
2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
141 |
+
2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 4
|
142 |
+
2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 5
|
143 |
+
2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
144 |
+
2024-08-12 07:34:15,232 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 5
|
145 |
+
2024-08-12 07:34:15,232 DEBUG SenderThread:14458 [sender.py:send():382] send: summary
|
146 |
+
2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
147 |
+
2024-08-12 07:34:15,233 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
148 |
+
2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 5
|
149 |
+
2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 6
|
150 |
+
2024-08-12 07:34:15,233 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
151 |
+
2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 6
|
152 |
+
2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
153 |
+
2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 6
|
154 |
+
2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 7
|
155 |
+
2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
156 |
+
2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
157 |
+
2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 7
|
158 |
+
2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
159 |
+
2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 7
|
160 |
+
2024-08-12 07:34:15,862 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 8
|
161 |
+
2024-08-12 07:34:15,862 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
162 |
+
2024-08-12 07:34:15,862 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 8
|
163 |
+
2024-08-12 07:34:15,863 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
164 |
+
2024-08-12 07:34:15,863 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 8
|
165 |
+
2024-08-12 07:34:15,863 INFO SenderThread:14458 [job_builder.py:build():296] Attempting to build job artifact
|
166 |
+
2024-08-12 07:34:15,864 INFO SenderThread:14458 [job_builder.py:_get_source_type():426] is repo sourced job
|
167 |
+
2024-08-12 07:34:15,878 INFO SenderThread:14458 [job_builder.py:build():402] adding wandb-job metadata file
|
168 |
+
2024-08-12 07:34:15,887 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 9
|
169 |
+
2024-08-12 07:34:15,887 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
170 |
+
2024-08-12 07:34:15,887 DEBUG SenderThread:14458 [sender.py:send():382] send: artifact
|
171 |
+
2024-08-12 07:34:15,887 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 9
|
172 |
+
2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
|
173 |
+
2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
|
174 |
+
2024-08-12 07:34:16,187 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
|
175 |
+
2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
|
176 |
+
2024-08-12 07:34:16,750 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
177 |
+
2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 9
|
178 |
+
2024-08-12 07:34:16,750 INFO SenderThread:14458 [dir_watcher.py:finish():358] shutting down directory watcher
|
179 |
+
2024-08-12 07:34:17,003 INFO SenderThread:14458 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_073202-yby212na/files
|
180 |
+
2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt requirements.txt
|
181 |
+
2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/config.yaml config.yaml
|
182 |
+
2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json wandb-metadata.json
|
183 |
+
2024-08-12 07:34:17,006 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json wandb-summary.json
|
184 |
+
2024-08-12 07:34:17,008 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/output.log output.log
|
185 |
+
2024-08-12 07:34:17,009 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 10
|
186 |
+
2024-08-12 07:34:17,009 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
|
187 |
+
2024-08-12 07:34:17,011 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
188 |
+
2024-08-12 07:34:17,011 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 10
|
189 |
+
2024-08-12 07:34:17,012 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
190 |
+
2024-08-12 07:34:17,012 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 10
|
191 |
+
2024-08-12 07:34:17,012 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
|
192 |
+
2024-08-12 07:34:17,188 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
|
193 |
+
2024-08-12 07:34:17,188 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
|
194 |
+
2024-08-12 07:34:17,408 INFO wandb-upload_1:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/config.yaml
|
195 |
+
2024-08-12 07:34:17,511 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
|
196 |
+
2024-08-12 07:34:17,588 INFO wandb-upload_2:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
|
197 |
+
2024-08-12 07:34:17,614 INFO wandb-upload_3:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/output.log
|
198 |
+
2024-08-12 07:34:17,814 INFO Thread-11 (_thread_body):14458 [sender.py:transition_state():617] send defer: 11
|
199 |
+
2024-08-12 07:34:17,814 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
200 |
+
2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 11
|
201 |
+
2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
202 |
+
2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 11
|
203 |
+
2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
|
204 |
+
2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 12
|
205 |
+
2024-08-12 07:34:17,815 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
206 |
+
2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 12
|
207 |
+
2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
208 |
+
2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 12
|
209 |
+
2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_stream.py:finish():595] file stream finish called
|
210 |
+
2024-08-12 07:34:18,362 INFO SenderThread:14458 [file_stream.py:finish():599] file stream finish is done
|
211 |
+
2024-08-12 07:34:18,362 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 13
|
212 |
+
2024-08-12 07:34:18,362 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
213 |
+
2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 13
|
214 |
+
2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
215 |
+
2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 13
|
216 |
+
2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 14
|
217 |
+
2024-08-12 07:34:18,363 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
|
218 |
+
2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: final
|
219 |
+
2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 14
|
220 |
+
2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: footer
|
221 |
+
2024-08-12 07:34:18,364 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
|
222 |
+
2024-08-12 07:34:18,364 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 14
|
223 |
+
2024-08-12 07:34:21,364 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
224 |
+
2024-08-12 07:34:26,365 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
225 |
+
2024-08-12 07:34:31,366 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
226 |
+
2024-08-12 07:34:36,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
227 |
+
2024-08-12 07:34:41,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
228 |
+
2024-08-12 07:34:46,368 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
229 |
+
2024-08-12 07:34:51,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
|
230 |
+
2024-08-12 07:34:51,550 WARNING StreamThr :14458 [internal.py:is_dead():414] Internal process exiting, parent pid 14387 disappeared
|
231 |
+
2024-08-12 07:34:51,550 ERROR StreamThr :14458 [internal.py:wandb_internal():152] Internal process shutdown.
|
232 |
+
2024-08-12 07:34:52,369 INFO SenderThread:14458 [sender.py:finish():1572] shutting down sender
|
233 |
+
2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
|
234 |
+
2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
|
235 |
+
2024-08-12 07:34:52,369 INFO HandlerThread:14458 [handler.py:finish():869] shutting down handler
|
236 |
+
2024-08-12 07:34:52,369 INFO WriterThread:14458 [datastore.py:close():296] close: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
|
wandb/run-20240812_073202-yby212na/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Configure stats pid to 14387
|
3 |
+
2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
|
6 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_073202-yby212na/logs/debug.log
|
9 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_073202-yby212na/logs/debug-internal.log
|
10 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:31:51', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 5, 'save_interval': 1, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-12 07:32:02,365 INFO MainThread:14387 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-12 07:32:02,366 INFO MainThread:14387 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-12 07:32:02,370 INFO MainThread:14387 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-12 07:32:02,382 INFO MainThread:14387 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-12 07:32:02,922 INFO MainThread:14387 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-12 07:32:03,065 INFO MainThread:14387 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240812_073202-yby212na/run-yby212na.wandb
ADDED
Binary file (26.1 kB). View file
|
|
wandb/run-20240815_041534-1ld4rgmy/files/config.yaml
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '304771887'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '304771887'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '304771887'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-qwen2-0.5B_train_2024-08-15-04:15:21
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 10
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 10
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-qwen2-0.5B
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 151680
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723662934.646627
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
- 105
|
300 |
+
2:
|
301 |
+
- 1
|
302 |
+
- 11
|
303 |
+
- 49
|
304 |
+
- 55
|
305 |
+
- 71
|
306 |
+
- 105
|
307 |
+
3:
|
308 |
+
- 13
|
309 |
+
- 16
|
310 |
+
- 23
|
311 |
+
4: 3.10.12
|
312 |
+
5: 0.16.3
|
313 |
+
6: 4.43.3
|
314 |
+
8:
|
315 |
+
- 5
|
316 |
+
13: linux-x86_64
|
317 |
+
model_architecture:
|
318 |
+
desc: null
|
319 |
+
value: Qwen2ForCausalLM
|
320 |
+
activation_function:
|
321 |
+
desc: null
|
322 |
+
value: silu
|
323 |
+
hidden_size:
|
324 |
+
desc: null
|
325 |
+
value: 896
|
326 |
+
model_type:
|
327 |
+
desc: null
|
328 |
+
value: qwen2
|
329 |
+
max_position_embeddings:
|
330 |
+
desc: null
|
331 |
+
value: 4096
|
332 |
+
num_attention_heads:
|
333 |
+
desc: null
|
334 |
+
value: 14
|
335 |
+
num_hidden_layers:
|
336 |
+
desc: null
|
337 |
+
value: 24
|
wandb/run-20240815_041534-1ld4rgmy/files/output.log
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
|
5 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
6 |
+
Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
|
7 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
8 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
9 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
10 |
+
--> applying fsdp activation checkpointing...
|
11 |
+
> datasets target sizes (minimum size):
|
12 |
+
train: 6400000
|
13 |
+
validation: 6403200
|
14 |
+
test: 3200
|
15 |
+
> building train, validation, and test datasets for GPT ...
|
16 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
17 |
+
warnings.warn(
|
18 |
+
Let split = None
|
19 |
+
Building a BlendedDataset for a single MegatronDataset
|
20 |
+
Unable to save the indexes because path_to_cache is None
|
21 |
+
Building a BlendedDataset for a single MegatronDataset
|
22 |
+
Unable to save the indexes because path_to_cache is None
|
23 |
+
Building a BlendedDataset for a single MegatronDataset
|
24 |
+
Unable to save the indexes because path_to_cache is None
|
25 |
+
> finished creating GPT datasets ...
|
26 |
+
Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
|
27 |
+
Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
|
28 |
+
model info: FullyShardedDataParallel(
|
29 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
30 |
+
(model): Qwen2Model(
|
31 |
+
(embed_tokens): Embedding(151936, 896)
|
32 |
+
(layers): ModuleList(
|
33 |
+
(0-23): 24 x FullyShardedDataParallel(
|
34 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
35 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
36 |
+
(self_attn): Qwen2FlashAttention2(
|
37 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
38 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
39 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
40 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
41 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
42 |
+
)
|
43 |
+
(mlp): Qwen2MLP(
|
44 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
45 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
46 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
47 |
+
(act_fn): SiLU()
|
48 |
+
)
|
49 |
+
(input_layernorm): Qwen2RMSNorm()
|
50 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
)
|
55 |
+
(norm): Qwen2RMSNorm()
|
56 |
+
)
|
57 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
model config: Qwen2Config {
|
61 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
62 |
+
"architectures": [
|
63 |
+
"Qwen2ForCausalLM"
|
64 |
+
],
|
65 |
+
"attention_dropout": 0.0,
|
66 |
+
"bos_token_id": 151643,
|
67 |
+
"eos_token_id": 151643,
|
68 |
+
"hidden_act": "silu",
|
69 |
+
"hidden_size": 896,
|
70 |
+
"initializer_range": 0.02,
|
71 |
+
"intermediate_size": 4864,
|
72 |
+
"label_smoothing": 0.0,
|
73 |
+
"max_position_embeddings": 4096,
|
74 |
+
"max_window_layers": 24,
|
75 |
+
"model_type": "qwen2",
|
76 |
+
"num_attention_heads": 14,
|
77 |
+
"num_hidden_layers": 24,
|
78 |
+
"num_key_value_heads": 2,
|
79 |
+
"rms_norm_eps": 1e-06,
|
80 |
+
"rope_theta": 1000000.0,
|
81 |
+
"sliding_window": null,
|
82 |
+
"tie_word_embeddings": true,
|
83 |
+
"torch_dtype": "bfloat16",
|
84 |
+
"transformers_version": "4.43.3",
|
85 |
+
"use_cache": false,
|
86 |
+
"use_sliding_window": false,
|
87 |
+
"vocab_size": 151936
|
88 |
+
}
|
89 |
+
[rank0]:[2024-08-15 04:15:41,598] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
90 |
+
------------------------------------------------------------------
|
91 |
+
iteration: 1161 , TFLOPS: 71.0304706218284, Tokens per sec: 17664.9211934734, Loss: 2.442603349685669
|
92 |
+
------------------------------------------------------------------
|
wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt
ADDED
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
cachetools==5.3.2
|
23 |
+
catalogue==2.0.10
|
24 |
+
certifi==2024.2.2
|
25 |
+
cffi==1.16.0
|
26 |
+
chardet==5.2.0
|
27 |
+
charset-normalizer==3.3.2
|
28 |
+
click==8.1.7
|
29 |
+
cloudpathlib==0.16.0
|
30 |
+
cloudpickle==3.0.0
|
31 |
+
cmake==3.28.1
|
32 |
+
colorama==0.4.6
|
33 |
+
comm==0.2.1
|
34 |
+
confection==0.1.4
|
35 |
+
contourpy==1.2.0
|
36 |
+
cramjam==2.8.3
|
37 |
+
cubinlinker==0.3.0+2.g405ac64
|
38 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
39 |
+
cudf==23.12.0
|
40 |
+
cugraph-dgl==23.12.0
|
41 |
+
cugraph-service-client==23.12.0
|
42 |
+
cugraph-service-server==23.12.0
|
43 |
+
cugraph==23.12.0
|
44 |
+
cuml==23.12.0
|
45 |
+
cupy-cuda12x==12.3.0
|
46 |
+
cycler==0.12.1
|
47 |
+
cymem==2.0.8
|
48 |
+
cython==3.0.8
|
49 |
+
dask-cuda==23.12.0
|
50 |
+
dask-cudf==23.12.0
|
51 |
+
dask==2023.11.0
|
52 |
+
dataclasses-json==0.6.7
|
53 |
+
dataproperty==1.0.1
|
54 |
+
datasets==2.20.0
|
55 |
+
debugpy==1.8.1
|
56 |
+
decorator==5.1.1
|
57 |
+
defusedxml==0.7.1
|
58 |
+
dill==0.3.8
|
59 |
+
distributed==2023.11.0
|
60 |
+
distro==1.9.0
|
61 |
+
dm-tree==0.1.8
|
62 |
+
docker-pycreds==0.4.0
|
63 |
+
einops==0.7.0
|
64 |
+
emoji==2.12.1
|
65 |
+
entmax==1.3
|
66 |
+
evaluate==0.4.2
|
67 |
+
exceptiongroup==1.2.0
|
68 |
+
execnet==2.0.2
|
69 |
+
executing==2.0.1
|
70 |
+
expecttest==0.1.3
|
71 |
+
fastjsonschema==2.19.1
|
72 |
+
fastparquet==2023.10.1
|
73 |
+
fastrlock==0.8.2
|
74 |
+
filelock==3.13.1
|
75 |
+
flash-attn==2.4.2
|
76 |
+
fonttools==4.48.1
|
77 |
+
frozenlist==1.4.1
|
78 |
+
fsspec==2023.12.2
|
79 |
+
fugashi==1.3.2
|
80 |
+
fuzzywuzzy==0.18.0
|
81 |
+
gast==0.5.4
|
82 |
+
gitdb==4.0.11
|
83 |
+
gitpython==3.1.43
|
84 |
+
google-auth-oauthlib==0.4.6
|
85 |
+
google-auth==2.27.0
|
86 |
+
graphsurgeon==0.4.6
|
87 |
+
greenlet==3.0.3
|
88 |
+
grpcio==1.60.1
|
89 |
+
h11==0.14.0
|
90 |
+
httpcore==1.0.5
|
91 |
+
httpx==0.27.0
|
92 |
+
huggingface-hub==0.24.5
|
93 |
+
hydra-core==1.3.2
|
94 |
+
hypothesis==5.35.1
|
95 |
+
idna==3.6
|
96 |
+
importlib-metadata==7.0.1
|
97 |
+
iniconfig==2.0.0
|
98 |
+
intel-openmp==2021.4.0
|
99 |
+
ipadic==1.0.0
|
100 |
+
ipykernel==6.29.2
|
101 |
+
ipython-genutils==0.2.0
|
102 |
+
ipython==8.21.0
|
103 |
+
isort==5.13.2
|
104 |
+
jedi==0.19.1
|
105 |
+
jinja2==3.1.3
|
106 |
+
jiter==0.5.0
|
107 |
+
joblib==1.3.2
|
108 |
+
json5==0.9.14
|
109 |
+
jsonargparse==3.13.1
|
110 |
+
jsonlines==4.0.0
|
111 |
+
jsonnet==0.19.1
|
112 |
+
jsonpatch==1.33
|
113 |
+
jsonpointer==3.0.0
|
114 |
+
jsonschema-specifications==2023.12.1
|
115 |
+
jsonschema==4.21.1
|
116 |
+
jupyter-client==8.6.0
|
117 |
+
jupyter-core==5.7.1
|
118 |
+
jupyter-tensorboard==0.2.0
|
119 |
+
jupyterlab-pygments==0.3.0
|
120 |
+
jupyterlab-server==1.2.0
|
121 |
+
jupyterlab==2.3.2
|
122 |
+
jupytext==1.16.1
|
123 |
+
kiwisolver==1.4.5
|
124 |
+
langchain-community==0.2.12
|
125 |
+
langchain-core==0.2.31
|
126 |
+
langchain-huggingface==0.0.2
|
127 |
+
langchain-openai==0.1.21
|
128 |
+
langchain-text-splitters==0.2.2
|
129 |
+
langchain==0.2.13
|
130 |
+
langcodes==3.3.0
|
131 |
+
langsmith==0.1.99
|
132 |
+
lazy-loader==0.3
|
133 |
+
levenshtein==0.25.1
|
134 |
+
librosa==0.10.1
|
135 |
+
lightning-utilities==0.11.6
|
136 |
+
llm-jp-eval==1.4.0
|
137 |
+
llvmlite==0.40.1
|
138 |
+
lm-eval==0.3.0
|
139 |
+
locket==1.0.0
|
140 |
+
logzero==1.7.0
|
141 |
+
lxml==5.2.2
|
142 |
+
markdown-it-py==3.0.0
|
143 |
+
markdown==3.5.2
|
144 |
+
markupsafe==2.1.4
|
145 |
+
marshmallow==3.21.3
|
146 |
+
matplotlib-inline==0.1.6
|
147 |
+
matplotlib==3.8.2
|
148 |
+
mbstrdecoder==1.1.3
|
149 |
+
mccabe==0.7.0
|
150 |
+
mdit-py-plugins==0.4.0
|
151 |
+
mdurl==0.1.2
|
152 |
+
mecab-python3==1.0.6
|
153 |
+
mistune==3.0.2
|
154 |
+
mkl-devel==2021.1.1
|
155 |
+
mkl-include==2021.1.1
|
156 |
+
mkl==2021.1.1
|
157 |
+
mock==5.1.0
|
158 |
+
mojimoji==0.0.13
|
159 |
+
more-itertools==9.1.0
|
160 |
+
mpmath==1.3.0
|
161 |
+
msgpack==1.0.7
|
162 |
+
multidict==6.0.4
|
163 |
+
multiprocess==0.70.16
|
164 |
+
murmurhash==1.0.10
|
165 |
+
mypy-extensions==1.0.0
|
166 |
+
nbclient==0.9.0
|
167 |
+
nbconvert==7.16.0
|
168 |
+
nbformat==5.9.2
|
169 |
+
neologdn==0.5.3
|
170 |
+
nest-asyncio==1.6.0
|
171 |
+
networkx==2.6.3
|
172 |
+
ninja==1.11.1.1
|
173 |
+
nltk==3.8.1
|
174 |
+
notebook==6.4.10
|
175 |
+
numba==0.57.1+1.g1ff679645
|
176 |
+
numexpr==2.10.1
|
177 |
+
numpy==1.24.4
|
178 |
+
nvfuser==0.1.4a0+d0bb811
|
179 |
+
nvidia-dali-cuda120==1.34.0
|
180 |
+
nvidia-pyindex==1.0.9
|
181 |
+
nvtx==0.2.5
|
182 |
+
oauthlib==3.2.2
|
183 |
+
omegaconf==2.3.0
|
184 |
+
onnx==1.15.0rc2
|
185 |
+
openai==1.40.6
|
186 |
+
opencv==4.7.0
|
187 |
+
optree==0.10.0
|
188 |
+
orjson==3.10.7
|
189 |
+
packaging==23.2
|
190 |
+
pandas==2.2.2
|
191 |
+
pandocfilters==1.5.1
|
192 |
+
parso==0.8.3
|
193 |
+
partd==1.4.1
|
194 |
+
pathvalidate==3.2.0
|
195 |
+
peft==0.5.0
|
196 |
+
pexpect==4.9.0
|
197 |
+
pillow==10.2.0
|
198 |
+
pip==24.0
|
199 |
+
plac==1.4.3
|
200 |
+
platformdirs==4.2.0
|
201 |
+
pluggy==1.4.0
|
202 |
+
ply==3.11
|
203 |
+
polygraphy==0.49.4
|
204 |
+
pooch==1.8.0
|
205 |
+
portalocker==2.10.1
|
206 |
+
preshed==3.0.9
|
207 |
+
prettytable==3.9.0
|
208 |
+
prometheus-client==0.19.0
|
209 |
+
prompt-toolkit==3.0.43
|
210 |
+
protobuf==4.24.4
|
211 |
+
psutil==5.9.4
|
212 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
213 |
+
ptyprocess==0.7.0
|
214 |
+
pure-eval==0.2.2
|
215 |
+
pyarrow-hotfix==0.6
|
216 |
+
pyarrow==15.0.2
|
217 |
+
pyasn1-modules==0.3.0
|
218 |
+
pyasn1==0.5.1
|
219 |
+
pybind11-global==2.11.1
|
220 |
+
pybind11==2.11.1
|
221 |
+
pycocotools==2.0+nv0.8.0
|
222 |
+
pycountry==24.6.1
|
223 |
+
pycparser==2.21
|
224 |
+
pydantic-core==2.16.2
|
225 |
+
pydantic==2.6.1
|
226 |
+
pygments==2.17.2
|
227 |
+
pylibcugraph==23.12.0
|
228 |
+
pylibcugraphops==23.12.0
|
229 |
+
pylibraft==23.12.0
|
230 |
+
pylint==3.2.6
|
231 |
+
pynvml==11.4.1
|
232 |
+
pyparsing==3.1.1
|
233 |
+
pytablewriter==1.2.0
|
234 |
+
pytest-flakefinder==1.1.0
|
235 |
+
pytest-rerunfailures==13.0
|
236 |
+
pytest-shard==0.1.2
|
237 |
+
pytest-xdist==3.5.0
|
238 |
+
pytest==8.0.0
|
239 |
+
python-dateutil==2.8.2
|
240 |
+
python-dotenv==1.0.0
|
241 |
+
python-hostlist==1.23.0
|
242 |
+
python-levenshtein==0.25.1
|
243 |
+
pytorch-lightning==2.4.0
|
244 |
+
pytorch-quantization==2.1.2
|
245 |
+
pytz==2023.3.post1
|
246 |
+
pyyaml==6.0.1
|
247 |
+
pyzmq==25.1.2
|
248 |
+
raft-dask==23.12.0
|
249 |
+
rapidfuzz==3.9.6
|
250 |
+
rapids-dask-dependency==23.12.1
|
251 |
+
referencing==0.33.0
|
252 |
+
regex==2023.12.25
|
253 |
+
requests-oauthlib==1.3.1
|
254 |
+
requests==2.32.3
|
255 |
+
rhoknp==1.7.0
|
256 |
+
rich==13.7.0
|
257 |
+
rmm==23.12.0
|
258 |
+
rouge-score==0.1.2
|
259 |
+
rpds-py==0.17.1
|
260 |
+
rsa==4.9
|
261 |
+
sacrebleu==2.4.2
|
262 |
+
safetensors==0.4.3
|
263 |
+
scikit-learn==1.5.1
|
264 |
+
scipy==1.12.0
|
265 |
+
send2trash==1.8.2
|
266 |
+
sentence-transformers==3.0.1
|
267 |
+
sentencepiece==0.1.99
|
268 |
+
sentry-sdk==2.12.0
|
269 |
+
setproctitle==1.3.3
|
270 |
+
setuptools==68.2.2
|
271 |
+
six==1.16.0
|
272 |
+
smart-open==6.4.0
|
273 |
+
smmap==5.0.1
|
274 |
+
sniffio==1.3.1
|
275 |
+
sortedcontainers==2.4.0
|
276 |
+
soundfile==0.12.1
|
277 |
+
soupsieve==2.5
|
278 |
+
soxr==0.3.7
|
279 |
+
spacy-legacy==3.0.12
|
280 |
+
spacy-loggers==1.0.5
|
281 |
+
spacy==3.7.2
|
282 |
+
sphinx-glpi-theme==0.6
|
283 |
+
sqlalchemy==2.0.32
|
284 |
+
sqlitedict==2.1.0
|
285 |
+
srsly==2.4.8
|
286 |
+
stack-data==0.6.3
|
287 |
+
sumeval==0.2.2
|
288 |
+
sympy==1.12
|
289 |
+
tabledata==1.3.3
|
290 |
+
tabulate==0.9.0
|
291 |
+
tbb==2021.11.0
|
292 |
+
tblib==3.0.0
|
293 |
+
tcolorpy==0.1.6
|
294 |
+
tenacity==8.5.0
|
295 |
+
tensorboard-data-server==0.6.1
|
296 |
+
tensorboard-plugin-wit==1.8.1
|
297 |
+
tensorboard==2.9.0
|
298 |
+
tensorrt==8.6.3
|
299 |
+
terminado==0.18.0
|
300 |
+
termplotlib==0.3.9
|
301 |
+
text-generation==0.7.0
|
302 |
+
thinc==8.2.3
|
303 |
+
threadpoolctl==3.2.0
|
304 |
+
thriftpy2==0.4.17
|
305 |
+
tiktoken==0.7.0
|
306 |
+
tinycss2==1.2.1
|
307 |
+
tokenizers==0.19.1
|
308 |
+
toml==0.10.2
|
309 |
+
tomli==2.0.1
|
310 |
+
tomlkit==0.13.2
|
311 |
+
toolz==0.12.1
|
312 |
+
torch-tensorrt==2.3.0a0
|
313 |
+
torch==2.3.0a0+ebedce2
|
314 |
+
torchdata==0.7.1a0
|
315 |
+
torchmetrics==0.10.3
|
316 |
+
torchtext==0.17.0a0
|
317 |
+
torchvision==0.18.0a0
|
318 |
+
tornado==6.4
|
319 |
+
tqdm-multiprocess==0.0.11
|
320 |
+
tqdm==4.66.5
|
321 |
+
traitlets==5.9.0
|
322 |
+
transformer-engine==1.3.0+5b90b7f
|
323 |
+
transformers==4.43.3
|
324 |
+
treelite-runtime==3.9.1
|
325 |
+
treelite==3.9.1
|
326 |
+
triton==2.2.0+e28a256
|
327 |
+
typepy==1.3.2
|
328 |
+
typer==0.9.0
|
329 |
+
types-dataclasses==0.6.6
|
330 |
+
typing-extensions==4.12.2
|
331 |
+
typing-inspect==0.9.0
|
332 |
+
tzdata==2024.1
|
333 |
+
ucx-py==0.35.0
|
334 |
+
uff==0.6.9
|
335 |
+
ujson==5.8.0
|
336 |
+
unbabel-comet==2.2.2
|
337 |
+
unidic-lite==1.0.8
|
338 |
+
urllib3==1.26.18
|
339 |
+
wandb==0.16.3
|
340 |
+
wasabi==1.1.2
|
341 |
+
wcwidth==0.2.13
|
342 |
+
weasel==0.3.4
|
343 |
+
webencodings==0.5.1
|
344 |
+
werkzeug==3.0.1
|
345 |
+
wheel==0.42.0
|
346 |
+
word2number==1.1
|
347 |
+
xdoctest==1.0.2
|
348 |
+
xgboost==1.7.6
|
349 |
+
xmltodict==0.13.0
|
350 |
+
xxhash==3.4.1
|
351 |
+
yarl==1.9.4
|
352 |
+
zict==3.0.0
|
353 |
+
zipp==3.17.0
|
354 |
+
zstandard==0.23.0
|
wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-14T19:15:35.173102",
|
5 |
+
"startedAt": "2024-08-14T19:15:34.633818",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--train-data-path",
|
24 |
+
"304771887",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"304771887",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"304771887",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"10",
|
56 |
+
"--eval-interval",
|
57 |
+
"10",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-qwen2-0.5B",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-qwen2-0.5B_train_2024-08-15-04:15:21"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0389999999993,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.039,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 86}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.0046823736576586325, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.284942626953125, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.21714400200017, "stats/tokens_per_sec": 17664.9211934734, "stats/tokens_per_sec_per_gpu": 17664.9211934734, "stats/tflops": 71.0304706218284, "_timestamp": 1723663016.4553976, "_runtime": 81.8087706565857, "_step": 1161}
|
wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-15 04:15:34,649 INFO StreamThr :12253 [internal.py:wandb_internal():86] W&B internal server running at pid: 12253, started at: 2024-08-15 04:15:34.648066
|
2 |
+
2024-08-15 04:15:34,650 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-15 04:15:34,652 INFO WriterThread:12253 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
|
4 |
+
2024-08-15 04:15:34,653 DEBUG SenderThread:12253 [sender.py:send():382] send: header
|
5 |
+
2024-08-15 04:15:34,666 DEBUG SenderThread:12253 [sender.py:send():382] send: run
|
6 |
+
2024-08-15 04:15:35,078 INFO SenderThread:12253 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_041534-1ld4rgmy/files
|
7 |
+
2024-08-15 04:15:35,078 INFO SenderThread:12253 [sender.py:_start_run_threads():1136] run started: 1ld4rgmy with start time 1723662934.646627
|
8 |
+
2024-08-15 04:15:35,084 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-15 04:15:35,084 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-15 04:15:35,155 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-15 04:15:35,161 INFO SystemMonitor:12253 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-15 04:15:35,163 INFO SystemMonitor:12253 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-15 04:15:35,173 DEBUG HandlerThread:12253 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-15 04:15:35,175 DEBUG HandlerThread:12253 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T19:15:35.173102', 'startedAt': '2024-08-14T19:15:34.633818', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-04:15:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-15 04:15:35,189 INFO HandlerThread:12253 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-15 04:15:35,195 DEBUG SenderThread:12253 [sender.py:send():382] send: files
|
30 |
+
2024-08-15 04:15:35,195 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-15 04:15:35,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-15 04:15:35,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-15 04:15:35,209 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-15 04:15:35,448 DEBUG SenderThread:12253 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-15 04:15:35,826 INFO wandb-upload_0:12253 [upload_job.py:push():131] Uploaded file /tmp/tmprvuc38znwandb/8jb1h2yo-wandb-metadata.json
|
38 |
+
2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt
|
39 |
+
2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json
|
40 |
+
2024-08-15 04:15:36,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
41 |
+
2024-08-15 04:15:38,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
42 |
+
2024-08-15 04:15:40,019 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-15 04:15:40,082 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
44 |
+
2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
|
45 |
+
2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
|
46 |
+
2024-08-15 04:15:42,083 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
47 |
+
2024-08-15 04:15:44,084 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
48 |
+
2024-08-15 04:15:45,879 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
49 |
+
2024-08-15 04:15:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
50 |
+
2024-08-15 04:15:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
51 |
+
2024-08-15 04:15:50,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
52 |
+
2024-08-15 04:15:51,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
53 |
+
2024-08-15 04:15:56,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
54 |
+
2024-08-15 04:16:01,412 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-15 04:16:05,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
56 |
+
2024-08-15 04:16:05,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
57 |
+
2024-08-15 04:16:05,246 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
58 |
+
2024-08-15 04:16:06,461 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
59 |
+
2024-08-15 04:16:08,114 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/config.yaml
|
60 |
+
2024-08-15 04:16:12,324 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
61 |
+
2024-08-15 04:16:17,325 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-15 04:16:20,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
63 |
+
2024-08-15 04:16:20,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
64 |
+
2024-08-15 04:16:20,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
65 |
+
2024-08-15 04:16:22,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
66 |
+
2024-08-15 04:16:27,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
67 |
+
2024-08-15 04:16:32,439 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
68 |
+
2024-08-15 04:16:35,164 DEBUG SystemMonitor:12253 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
69 |
+
2024-08-15 04:16:35,166 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
|
70 |
+
2024-08-15 04:16:35,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
71 |
+
2024-08-15 04:16:35,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
72 |
+
2024-08-15 04:16:35,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
73 |
+
2024-08-15 04:16:38,433 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
74 |
+
2024-08-15 04:16:43,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
75 |
+
2024-08-15 04:16:48,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
76 |
+
2024-08-15 04:16:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
|
77 |
+
2024-08-15 04:16:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
|
78 |
+
2024-08-15 04:16:50,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
|
79 |
+
2024-08-15 04:16:54,406 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
80 |
+
2024-08-15 04:16:56,456 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: partial_history
|
81 |
+
2024-08-15 04:16:58,142 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
82 |
+
2024-08-15 04:16:59,499 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
83 |
+
2024-08-15 04:17:02,068 DEBUG SenderThread:12253 [sender.py:send():382] send: exit
|
84 |
+
2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():589] handling exit code: 255
|
85 |
+
2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():591] handling runtime: 86
|
86 |
+
2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
87 |
+
2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:send_exit():597] send defer
|
88 |
+
2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
89 |
+
2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 0
|
90 |
+
2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
91 |
+
2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 0
|
92 |
+
2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 1
|
93 |
+
2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
94 |
+
2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 1
|
95 |
+
2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
96 |
+
2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 1
|
97 |
+
2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 2
|
98 |
+
2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
99 |
+
2024-08-15 04:17:02,072 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 2
|
100 |
+
2024-08-15 04:17:02,072 INFO HandlerThread:12253 [system_monitor.py:finish():203] Stopping system monitor
|
101 |
+
2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
102 |
+
2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined cpu monitor
|
103 |
+
2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():183] Publishing last batch of metrics
|
104 |
+
2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined disk monitor
|
105 |
+
2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined gpu monitor
|
106 |
+
2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined memory monitor
|
107 |
+
2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined network monitor
|
108 |
+
2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
109 |
+
2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 2
|
110 |
+
2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 3
|
111 |
+
2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
|
112 |
+
2024-08-15 04:17:02,108 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
113 |
+
2024-08-15 04:17:02,108 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 3
|
114 |
+
2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send():382] send: history
|
115 |
+
2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: summary_record
|
116 |
+
2024-08-15 04:17:02,112 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
117 |
+
2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
118 |
+
2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 3
|
119 |
+
2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 4
|
120 |
+
2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
121 |
+
2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 4
|
122 |
+
2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
123 |
+
2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 4
|
124 |
+
2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 5
|
125 |
+
2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
126 |
+
2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 5
|
127 |
+
2024-08-15 04:17:02,114 DEBUG SenderThread:12253 [sender.py:send():382] send: summary
|
128 |
+
2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
129 |
+
2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
130 |
+
2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 5
|
131 |
+
2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 6
|
132 |
+
2024-08-15 04:17:02,115 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
133 |
+
2024-08-15 04:17:02,115 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 6
|
134 |
+
2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
135 |
+
2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 6
|
136 |
+
2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 7
|
137 |
+
2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
|
138 |
+
2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
139 |
+
2024-08-15 04:17:02,116 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 7
|
140 |
+
2024-08-15 04:17:02,116 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
141 |
+
2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 7
|
142 |
+
2024-08-15 04:17:02,145 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json
|
143 |
+
2024-08-15 04:17:03,068 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
|
144 |
+
2024-08-15 04:17:03,854 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 8
|
145 |
+
2024-08-15 04:17:03,854 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: poll_exit
|
146 |
+
2024-08-15 04:17:03,854 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
147 |
+
2024-08-15 04:17:03,855 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 8
|
148 |
+
2024-08-15 04:17:03,855 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
|
149 |
+
2024-08-15 04:17:03,855 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 8
|
150 |
+
2024-08-15 04:17:03,855 INFO SenderThread:12253 [job_builder.py:build():296] Attempting to build job artifact
|
151 |
+
2024-08-15 04:17:03,856 INFO SenderThread:12253 [job_builder.py:_get_source_type():426] is repo sourced job
|
152 |
+
2024-08-15 04:17:03,871 INFO SenderThread:12253 [job_builder.py:build():402] adding wandb-job metadata file
|
153 |
+
2024-08-15 04:17:03,880 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 9
|
154 |
+
2024-08-15 04:17:03,880 DEBUG SenderThread:12253 [sender.py:send():382] send: artifact
|
155 |
+
2024-08-15 04:17:03,880 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
|
156 |
+
2024-08-15 04:17:03,881 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 9
|
157 |
+
2024-08-15 04:17:04,069 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
|
158 |
+
2024-08-15 04:17:04,146 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
|
159 |
+
2024-08-15 04:17:35,760 WARNING StreamThr :12253 [internal.py:is_dead():414] Internal process exiting, parent pid 12182 disappeared
|
160 |
+
2024-08-15 04:17:35,760 ERROR StreamThr :12253 [internal.py:wandb_internal():152] Internal process shutdown.
|
161 |
+
2024-08-15 04:17:36,070 INFO WriterThread:12253 [datastore.py:close():296] close: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
|
162 |
+
2024-08-15 04:17:36,071 INFO HandlerThread:12253 [handler.py:finish():869] shutting down handler
|
wandb/run-20240815_041534-1ld4rgmy/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-15 04:15:34,639 INFO MainThread:12182 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Configure stats pid to 12182
|
3 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
|
6 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug.log
|
9 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log
|
10 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-04:15:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-15 04:15:34,645 INFO MainThread:12182 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-15 04:15:34,646 INFO MainThread:12182 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-15 04:15:34,651 INFO MainThread:12182 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-15 04:15:34,662 INFO MainThread:12182 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-15 04:15:35,083 INFO MainThread:12182 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-15 04:15:35,205 INFO MainThread:12182 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-15 04:15:35,207 INFO MainThread:12182 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
ADDED
Binary file (18 kB). View file
|
|
wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
ADDED
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: NO_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1205770'
|
38 |
+
- /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1205770'
|
43 |
+
- /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 1024
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 4
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: yans_experiment
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 200
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 200
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 3.5e-06
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 23178
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 3.5e-07
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 23178
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 1280
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 16
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-08
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/yans-baseline-qwen2-0.5B
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 1
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 8
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 10
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724498422.652614
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
wandb/run-20240824_202022-z2bjbf6e/files/output.log
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-baseline-qwen2-0.5B.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
6 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
7 |
+
File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
8 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
9 |
+
File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
10 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
|
11 |
+
No checkpoint found in /work/llm_recipes/models/yans-baseline-qwen2-0.5B, skipping model loading
|
12 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
13 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
14 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
15 |
+
Let split = None
|
16 |
+
Unable to save the indexes because path_to_cache is None
|
17 |
+
Traceback (most recent call last):
|
18 |
+
File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 270, in build_generic_dataset
|
19 |
+
dataset = cls(*args)
|
20 |
+
File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 359, in __init__
|
21 |
+
self.initialize(path_prefix, multimodal)
|
22 |
+
File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 374, in initialize
|
23 |
+
self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal)
|
24 |
+
File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 233, in __init__
|
25 |
+
with open(idx_path, "rb") as stream:
|
26 |
+
FileNotFoundError: [Errno 2] No such file or directory: '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document.idx'
|
27 |
+
The above exception was the direct cause of the following exception:
|
28 |
+
Traceback (most recent call last):
|
29 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
30 |
+
main()
|
31 |
+
File "/project/src/llama_recipes/finetuning.py", line 162, in main
|
32 |
+
train_dataset, validation_dataset, test_dataset = build_train_valid_test_datasets()
|
33 |
+
File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 76, in build_train_valid_test_datasets
|
34 |
+
return train_valid_test_datasets_provider(train_val_test_num_samples)
|
35 |
+
File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 46, in train_valid_test_datasets_provider
|
36 |
+
).build()
|
37 |
+
File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 56, in build
|
38 |
+
return self._build_blended_dataset_splits()
|
39 |
+
File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 162, in _build_blended_dataset_splits
|
40 |
+
self._build_megatron_dataset_splits(
|
41 |
+
File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 199, in _build_megatron_dataset_splits
|
42 |
+
indexed_dataset = self.build_generic_dataset(
|
43 |
+
File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 278, in build_generic_dataset
|
44 |
+
raise Exception(log) from err
|
45 |
+
Exception: Failed to write dataset materials to the data cache directory. Please supply a directory to which you have write access via the path_to_cache attribute in BlendedMegatronDatasetConfig and retry. Refer to the preserved traceback above for more information.
|
46 |
+
--> applying fsdp activation checkpointing...
|
47 |
+
> datasets target sizes (minimum size):
|
48 |
+
train: 29667840
|
49 |
+
validation: 1484800
|
50 |
+
test: 12800
|
51 |
+
> building train, validation, and test datasets for GPT ...
|
wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json
ADDED
@@ -0,0 +1,880 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-24T11:20:23.248321",
|
5 |
+
"startedAt": "2024-08-24T11:20:22.637930",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"1024",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"16",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"1",
|
17 |
+
"--global-batch-size",
|
18 |
+
"1280",
|
19 |
+
"--train-iters",
|
20 |
+
"23178",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1205770",
|
32 |
+
"/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1205770",
|
35 |
+
"/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"3.5e-6",
|
38 |
+
"--min-lr",
|
39 |
+
"3.5e-7",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"23178",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-8",
|
58 |
+
"--save-interval",
|
59 |
+
"200",
|
60 |
+
"--eval-interval",
|
61 |
+
"200",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
|
72 |
+
"--num-workers",
|
73 |
+
"4",
|
74 |
+
"--fsdp-activation-checkpointing",
|
75 |
+
"--sharding-strategy",
|
76 |
+
"NO_SHARD",
|
77 |
+
"--checkpoint-type",
|
78 |
+
"LOCAL_STATE_DICT",
|
79 |
+
"--save-n-checkpoints",
|
80 |
+
"10",
|
81 |
+
"--upload-all-checkpoints-to-hf",
|
82 |
+
"--hf-upload-retry-limit",
|
83 |
+
"2",
|
84 |
+
"--hf-repo-id",
|
85 |
+
"koichi12/yans-baseline-qwen2-0.5B",
|
86 |
+
"--wandb-entity",
|
87 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
88 |
+
"--wandb-project",
|
89 |
+
"yans_experiment",
|
90 |
+
"--wandb-name",
|
91 |
+
"yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07"
|
92 |
+
],
|
93 |
+
"state": "running",
|
94 |
+
"program": "/project/examples/finetuning.py",
|
95 |
+
"codePathLocal": "examples/finetuning.py",
|
96 |
+
"codePath": "examples/finetuning.py",
|
97 |
+
"git": {
|
98 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
99 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
100 |
+
},
|
101 |
+
"email": null,
|
102 |
+
"root": "/project",
|
103 |
+
"host": "gpu-koiwa-00",
|
104 |
+
"username": "koiwa",
|
105 |
+
"executable": "/usr/bin/python",
|
106 |
+
"cpu_count": 144,
|
107 |
+
"cpu_count_logical": 144,
|
108 |
+
"cpu_freq": {
|
109 |
+
"current": 2400.0340000000015,
|
110 |
+
"min": 0.0,
|
111 |
+
"max": 0.0
|
112 |
+
},
|
113 |
+
"cpu_freq_per_core": [
|
114 |
+
{
|
115 |
+
"current": 2400.034,
|
116 |
+
"min": 0.0,
|
117 |
+
"max": 0.0
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"current": 2400.034,
|
121 |
+
"min": 0.0,
|
122 |
+
"max": 0.0
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"current": 2400.034,
|
126 |
+
"min": 0.0,
|
127 |
+
"max": 0.0
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"current": 2400.034,
|
131 |
+
"min": 0.0,
|
132 |
+
"max": 0.0
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"current": 2400.034,
|
136 |
+
"min": 0.0,
|
137 |
+
"max": 0.0
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"current": 2400.034,
|
141 |
+
"min": 0.0,
|
142 |
+
"max": 0.0
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"current": 2400.034,
|
146 |
+
"min": 0.0,
|
147 |
+
"max": 0.0
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"current": 2400.034,
|
151 |
+
"min": 0.0,
|
152 |
+
"max": 0.0
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"current": 2400.034,
|
156 |
+
"min": 0.0,
|
157 |
+
"max": 0.0
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"current": 2400.034,
|
161 |
+
"min": 0.0,
|
162 |
+
"max": 0.0
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"current": 2400.034,
|
166 |
+
"min": 0.0,
|
167 |
+
"max": 0.0
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"current": 2400.034,
|
171 |
+
"min": 0.0,
|
172 |
+
"max": 0.0
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"current": 2400.034,
|
176 |
+
"min": 0.0,
|
177 |
+
"max": 0.0
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"current": 2400.034,
|
181 |
+
"min": 0.0,
|
182 |
+
"max": 0.0
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"current": 2400.034,
|
186 |
+
"min": 0.0,
|
187 |
+
"max": 0.0
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"current": 2400.034,
|
191 |
+
"min": 0.0,
|
192 |
+
"max": 0.0
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"current": 2400.034,
|
196 |
+
"min": 0.0,
|
197 |
+
"max": 0.0
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"current": 2400.034,
|
201 |
+
"min": 0.0,
|
202 |
+
"max": 0.0
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"current": 2400.034,
|
206 |
+
"min": 0.0,
|
207 |
+
"max": 0.0
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"current": 2400.034,
|
211 |
+
"min": 0.0,
|
212 |
+
"max": 0.0
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"current": 2400.034,
|
216 |
+
"min": 0.0,
|
217 |
+
"max": 0.0
|
218 |
+
},
|
219 |
+
{
|
220 |
+
"current": 2400.034,
|
221 |
+
"min": 0.0,
|
222 |
+
"max": 0.0
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"current": 2400.034,
|
226 |
+
"min": 0.0,
|
227 |
+
"max": 0.0
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"current": 2400.034,
|
231 |
+
"min": 0.0,
|
232 |
+
"max": 0.0
|
233 |
+
},
|
234 |
+
{
|
235 |
+
"current": 2400.034,
|
236 |
+
"min": 0.0,
|
237 |
+
"max": 0.0
|
238 |
+
},
|
239 |
+
{
|
240 |
+
"current": 2400.034,
|
241 |
+
"min": 0.0,
|
242 |
+
"max": 0.0
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"current": 2400.034,
|
246 |
+
"min": 0.0,
|
247 |
+
"max": 0.0
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"current": 2400.034,
|
251 |
+
"min": 0.0,
|
252 |
+
"max": 0.0
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"current": 2400.034,
|
256 |
+
"min": 0.0,
|
257 |
+
"max": 0.0
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"current": 2400.034,
|
261 |
+
"min": 0.0,
|
262 |
+
"max": 0.0
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"current": 2400.034,
|
266 |
+
"min": 0.0,
|
267 |
+
"max": 0.0
|
268 |
+
},
|
269 |
+
{
|
270 |
+
"current": 2400.034,
|
271 |
+
"min": 0.0,
|
272 |
+
"max": 0.0
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"current": 2400.034,
|
276 |
+
"min": 0.0,
|
277 |
+
"max": 0.0
|
278 |
+
},
|
279 |
+
{
|
280 |
+
"current": 2400.034,
|
281 |
+
"min": 0.0,
|
282 |
+
"max": 0.0
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"current": 2400.034,
|
286 |
+
"min": 0.0,
|
287 |
+
"max": 0.0
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"current": 2400.034,
|
291 |
+
"min": 0.0,
|
292 |
+
"max": 0.0
|
293 |
+
},
|
294 |
+
{
|
295 |
+
"current": 2400.034,
|
296 |
+
"min": 0.0,
|
297 |
+
"max": 0.0
|
298 |
+
},
|
299 |
+
{
|
300 |
+
"current": 2400.034,
|
301 |
+
"min": 0.0,
|
302 |
+
"max": 0.0
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"current": 2400.034,
|
306 |
+
"min": 0.0,
|
307 |
+
"max": 0.0
|
308 |
+
},
|
309 |
+
{
|
310 |
+
"current": 2400.034,
|
311 |
+
"min": 0.0,
|
312 |
+
"max": 0.0
|
313 |
+
},
|
314 |
+
{
|
315 |
+
"current": 2400.034,
|
316 |
+
"min": 0.0,
|
317 |
+
"max": 0.0
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"current": 2400.034,
|
321 |
+
"min": 0.0,
|
322 |
+
"max": 0.0
|
323 |
+
},
|
324 |
+
{
|
325 |
+
"current": 2400.034,
|
326 |
+
"min": 0.0,
|
327 |
+
"max": 0.0
|
328 |
+
},
|
329 |
+
{
|
330 |
+
"current": 2400.034,
|
331 |
+
"min": 0.0,
|
332 |
+
"max": 0.0
|
333 |
+
},
|
334 |
+
{
|
335 |
+
"current": 2400.034,
|
336 |
+
"min": 0.0,
|
337 |
+
"max": 0.0
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"current": 2400.034,
|
341 |
+
"min": 0.0,
|
342 |
+
"max": 0.0
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"current": 2400.034,
|
346 |
+
"min": 0.0,
|
347 |
+
"max": 0.0
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"current": 2400.034,
|
351 |
+
"min": 0.0,
|
352 |
+
"max": 0.0
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"current": 2400.034,
|
356 |
+
"min": 0.0,
|
357 |
+
"max": 0.0
|
358 |
+
},
|
359 |
+
{
|
360 |
+
"current": 2400.034,
|
361 |
+
"min": 0.0,
|
362 |
+
"max": 0.0
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"current": 2400.034,
|
366 |
+
"min": 0.0,
|
367 |
+
"max": 0.0
|
368 |
+
},
|
369 |
+
{
|
370 |
+
"current": 2400.034,
|
371 |
+
"min": 0.0,
|
372 |
+
"max": 0.0
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"current": 2400.034,
|
376 |
+
"min": 0.0,
|
377 |
+
"max": 0.0
|
378 |
+
},
|
379 |
+
{
|
380 |
+
"current": 2400.034,
|
381 |
+
"min": 0.0,
|
382 |
+
"max": 0.0
|
383 |
+
},
|
384 |
+
{
|
385 |
+
"current": 2400.034,
|
386 |
+
"min": 0.0,
|
387 |
+
"max": 0.0
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"current": 2400.034,
|
391 |
+
"min": 0.0,
|
392 |
+
"max": 0.0
|
393 |
+
},
|
394 |
+
{
|
395 |
+
"current": 2400.034,
|
396 |
+
"min": 0.0,
|
397 |
+
"max": 0.0
|
398 |
+
},
|
399 |
+
{
|
400 |
+
"current": 2400.034,
|
401 |
+
"min": 0.0,
|
402 |
+
"max": 0.0
|
403 |
+
},
|
404 |
+
{
|
405 |
+
"current": 2400.034,
|
406 |
+
"min": 0.0,
|
407 |
+
"max": 0.0
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"current": 2400.034,
|
411 |
+
"min": 0.0,
|
412 |
+
"max": 0.0
|
413 |
+
},
|
414 |
+
{
|
415 |
+
"current": 2400.034,
|
416 |
+
"min": 0.0,
|
417 |
+
"max": 0.0
|
418 |
+
},
|
419 |
+
{
|
420 |
+
"current": 2400.034,
|
421 |
+
"min": 0.0,
|
422 |
+
"max": 0.0
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"current": 2400.034,
|
426 |
+
"min": 0.0,
|
427 |
+
"max": 0.0
|
428 |
+
},
|
429 |
+
{
|
430 |
+
"current": 2400.034,
|
431 |
+
"min": 0.0,
|
432 |
+
"max": 0.0
|
433 |
+
},
|
434 |
+
{
|
435 |
+
"current": 2400.034,
|
436 |
+
"min": 0.0,
|
437 |
+
"max": 0.0
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"current": 2400.034,
|
441 |
+
"min": 0.0,
|
442 |
+
"max": 0.0
|
443 |
+
},
|
444 |
+
{
|
445 |
+
"current": 2400.034,
|
446 |
+
"min": 0.0,
|
447 |
+
"max": 0.0
|
448 |
+
},
|
449 |
+
{
|
450 |
+
"current": 2400.034,
|
451 |
+
"min": 0.0,
|
452 |
+
"max": 0.0
|
453 |
+
},
|
454 |
+
{
|
455 |
+
"current": 2400.034,
|
456 |
+
"min": 0.0,
|
457 |
+
"max": 0.0
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"current": 2400.034,
|
461 |
+
"min": 0.0,
|
462 |
+
"max": 0.0
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"current": 2400.034,
|
466 |
+
"min": 0.0,
|
467 |
+
"max": 0.0
|
468 |
+
},
|
469 |
+
{
|
470 |
+
"current": 2400.034,
|
471 |
+
"min": 0.0,
|
472 |
+
"max": 0.0
|
473 |
+
},
|
474 |
+
{
|
475 |
+
"current": 2400.034,
|
476 |
+
"min": 0.0,
|
477 |
+
"max": 0.0
|
478 |
+
},
|
479 |
+
{
|
480 |
+
"current": 2400.034,
|
481 |
+
"min": 0.0,
|
482 |
+
"max": 0.0
|
483 |
+
},
|
484 |
+
{
|
485 |
+
"current": 2400.034,
|
486 |
+
"min": 0.0,
|
487 |
+
"max": 0.0
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"current": 2400.034,
|
491 |
+
"min": 0.0,
|
492 |
+
"max": 0.0
|
493 |
+
},
|
494 |
+
{
|
495 |
+
"current": 2400.034,
|
496 |
+
"min": 0.0,
|
497 |
+
"max": 0.0
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"current": 2400.034,
|
501 |
+
"min": 0.0,
|
502 |
+
"max": 0.0
|
503 |
+
},
|
504 |
+
{
|
505 |
+
"current": 2400.034,
|
506 |
+
"min": 0.0,
|
507 |
+
"max": 0.0
|
508 |
+
},
|
509 |
+
{
|
510 |
+
"current": 2400.034,
|
511 |
+
"min": 0.0,
|
512 |
+
"max": 0.0
|
513 |
+
},
|
514 |
+
{
|
515 |
+
"current": 2400.034,
|
516 |
+
"min": 0.0,
|
517 |
+
"max": 0.0
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"current": 2400.034,
|
521 |
+
"min": 0.0,
|
522 |
+
"max": 0.0
|
523 |
+
},
|
524 |
+
{
|
525 |
+
"current": 2400.034,
|
526 |
+
"min": 0.0,
|
527 |
+
"max": 0.0
|
528 |
+
},
|
529 |
+
{
|
530 |
+
"current": 2400.034,
|
531 |
+
"min": 0.0,
|
532 |
+
"max": 0.0
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"current": 2400.034,
|
536 |
+
"min": 0.0,
|
537 |
+
"max": 0.0
|
538 |
+
},
|
539 |
+
{
|
540 |
+
"current": 2400.034,
|
541 |
+
"min": 0.0,
|
542 |
+
"max": 0.0
|
543 |
+
},
|
544 |
+
{
|
545 |
+
"current": 2400.034,
|
546 |
+
"min": 0.0,
|
547 |
+
"max": 0.0
|
548 |
+
},
|
549 |
+
{
|
550 |
+
"current": 2400.034,
|
551 |
+
"min": 0.0,
|
552 |
+
"max": 0.0
|
553 |
+
},
|
554 |
+
{
|
555 |
+
"current": 2400.034,
|
556 |
+
"min": 0.0,
|
557 |
+
"max": 0.0
|
558 |
+
},
|
559 |
+
{
|
560 |
+
"current": 2400.034,
|
561 |
+
"min": 0.0,
|
562 |
+
"max": 0.0
|
563 |
+
},
|
564 |
+
{
|
565 |
+
"current": 2400.034,
|
566 |
+
"min": 0.0,
|
567 |
+
"max": 0.0
|
568 |
+
},
|
569 |
+
{
|
570 |
+
"current": 2400.034,
|
571 |
+
"min": 0.0,
|
572 |
+
"max": 0.0
|
573 |
+
},
|
574 |
+
{
|
575 |
+
"current": 2400.034,
|
576 |
+
"min": 0.0,
|
577 |
+
"max": 0.0
|
578 |
+
},
|
579 |
+
{
|
580 |
+
"current": 2400.034,
|
581 |
+
"min": 0.0,
|
582 |
+
"max": 0.0
|
583 |
+
},
|
584 |
+
{
|
585 |
+
"current": 2400.034,
|
586 |
+
"min": 0.0,
|
587 |
+
"max": 0.0
|
588 |
+
},
|
589 |
+
{
|
590 |
+
"current": 2400.034,
|
591 |
+
"min": 0.0,
|
592 |
+
"max": 0.0
|
593 |
+
},
|
594 |
+
{
|
595 |
+
"current": 2400.034,
|
596 |
+
"min": 0.0,
|
597 |
+
"max": 0.0
|
598 |
+
},
|
599 |
+
{
|
600 |
+
"current": 2400.034,
|
601 |
+
"min": 0.0,
|
602 |
+
"max": 0.0
|
603 |
+
},
|
604 |
+
{
|
605 |
+
"current": 2400.034,
|
606 |
+
"min": 0.0,
|
607 |
+
"max": 0.0
|
608 |
+
},
|
609 |
+
{
|
610 |
+
"current": 2400.034,
|
611 |
+
"min": 0.0,
|
612 |
+
"max": 0.0
|
613 |
+
},
|
614 |
+
{
|
615 |
+
"current": 2400.034,
|
616 |
+
"min": 0.0,
|
617 |
+
"max": 0.0
|
618 |
+
},
|
619 |
+
{
|
620 |
+
"current": 2400.034,
|
621 |
+
"min": 0.0,
|
622 |
+
"max": 0.0
|
623 |
+
},
|
624 |
+
{
|
625 |
+
"current": 2400.034,
|
626 |
+
"min": 0.0,
|
627 |
+
"max": 0.0
|
628 |
+
},
|
629 |
+
{
|
630 |
+
"current": 2400.034,
|
631 |
+
"min": 0.0,
|
632 |
+
"max": 0.0
|
633 |
+
},
|
634 |
+
{
|
635 |
+
"current": 2400.034,
|
636 |
+
"min": 0.0,
|
637 |
+
"max": 0.0
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"current": 2400.034,
|
641 |
+
"min": 0.0,
|
642 |
+
"max": 0.0
|
643 |
+
},
|
644 |
+
{
|
645 |
+
"current": 2400.034,
|
646 |
+
"min": 0.0,
|
647 |
+
"max": 0.0
|
648 |
+
},
|
649 |
+
{
|
650 |
+
"current": 2400.034,
|
651 |
+
"min": 0.0,
|
652 |
+
"max": 0.0
|
653 |
+
},
|
654 |
+
{
|
655 |
+
"current": 2400.034,
|
656 |
+
"min": 0.0,
|
657 |
+
"max": 0.0
|
658 |
+
},
|
659 |
+
{
|
660 |
+
"current": 2400.034,
|
661 |
+
"min": 0.0,
|
662 |
+
"max": 0.0
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"current": 2400.034,
|
666 |
+
"min": 0.0,
|
667 |
+
"max": 0.0
|
668 |
+
},
|
669 |
+
{
|
670 |
+
"current": 2400.034,
|
671 |
+
"min": 0.0,
|
672 |
+
"max": 0.0
|
673 |
+
},
|
674 |
+
{
|
675 |
+
"current": 2400.034,
|
676 |
+
"min": 0.0,
|
677 |
+
"max": 0.0
|
678 |
+
},
|
679 |
+
{
|
680 |
+
"current": 2400.034,
|
681 |
+
"min": 0.0,
|
682 |
+
"max": 0.0
|
683 |
+
},
|
684 |
+
{
|
685 |
+
"current": 2400.034,
|
686 |
+
"min": 0.0,
|
687 |
+
"max": 0.0
|
688 |
+
},
|
689 |
+
{
|
690 |
+
"current": 2400.034,
|
691 |
+
"min": 0.0,
|
692 |
+
"max": 0.0
|
693 |
+
},
|
694 |
+
{
|
695 |
+
"current": 2400.034,
|
696 |
+
"min": 0.0,
|
697 |
+
"max": 0.0
|
698 |
+
},
|
699 |
+
{
|
700 |
+
"current": 2400.034,
|
701 |
+
"min": 0.0,
|
702 |
+
"max": 0.0
|
703 |
+
},
|
704 |
+
{
|
705 |
+
"current": 2400.034,
|
706 |
+
"min": 0.0,
|
707 |
+
"max": 0.0
|
708 |
+
},
|
709 |
+
{
|
710 |
+
"current": 2400.034,
|
711 |
+
"min": 0.0,
|
712 |
+
"max": 0.0
|
713 |
+
},
|
714 |
+
{
|
715 |
+
"current": 2400.034,
|
716 |
+
"min": 0.0,
|
717 |
+
"max": 0.0
|
718 |
+
},
|
719 |
+
{
|
720 |
+
"current": 2400.034,
|
721 |
+
"min": 0.0,
|
722 |
+
"max": 0.0
|
723 |
+
},
|
724 |
+
{
|
725 |
+
"current": 2400.034,
|
726 |
+
"min": 0.0,
|
727 |
+
"max": 0.0
|
728 |
+
},
|
729 |
+
{
|
730 |
+
"current": 2400.034,
|
731 |
+
"min": 0.0,
|
732 |
+
"max": 0.0
|
733 |
+
},
|
734 |
+
{
|
735 |
+
"current": 2400.034,
|
736 |
+
"min": 0.0,
|
737 |
+
"max": 0.0
|
738 |
+
},
|
739 |
+
{
|
740 |
+
"current": 2400.034,
|
741 |
+
"min": 0.0,
|
742 |
+
"max": 0.0
|
743 |
+
},
|
744 |
+
{
|
745 |
+
"current": 2400.034,
|
746 |
+
"min": 0.0,
|
747 |
+
"max": 0.0
|
748 |
+
},
|
749 |
+
{
|
750 |
+
"current": 2400.034,
|
751 |
+
"min": 0.0,
|
752 |
+
"max": 0.0
|
753 |
+
},
|
754 |
+
{
|
755 |
+
"current": 2400.034,
|
756 |
+
"min": 0.0,
|
757 |
+
"max": 0.0
|
758 |
+
},
|
759 |
+
{
|
760 |
+
"current": 2400.034,
|
761 |
+
"min": 0.0,
|
762 |
+
"max": 0.0
|
763 |
+
},
|
764 |
+
{
|
765 |
+
"current": 2400.034,
|
766 |
+
"min": 0.0,
|
767 |
+
"max": 0.0
|
768 |
+
},
|
769 |
+
{
|
770 |
+
"current": 2400.034,
|
771 |
+
"min": 0.0,
|
772 |
+
"max": 0.0
|
773 |
+
},
|
774 |
+
{
|
775 |
+
"current": 2400.034,
|
776 |
+
"min": 0.0,
|
777 |
+
"max": 0.0
|
778 |
+
},
|
779 |
+
{
|
780 |
+
"current": 2400.034,
|
781 |
+
"min": 0.0,
|
782 |
+
"max": 0.0
|
783 |
+
},
|
784 |
+
{
|
785 |
+
"current": 2400.034,
|
786 |
+
"min": 0.0,
|
787 |
+
"max": 0.0
|
788 |
+
},
|
789 |
+
{
|
790 |
+
"current": 2400.034,
|
791 |
+
"min": 0.0,
|
792 |
+
"max": 0.0
|
793 |
+
},
|
794 |
+
{
|
795 |
+
"current": 2400.034,
|
796 |
+
"min": 0.0,
|
797 |
+
"max": 0.0
|
798 |
+
},
|
799 |
+
{
|
800 |
+
"current": 2400.034,
|
801 |
+
"min": 0.0,
|
802 |
+
"max": 0.0
|
803 |
+
},
|
804 |
+
{
|
805 |
+
"current": 2400.034,
|
806 |
+
"min": 0.0,
|
807 |
+
"max": 0.0
|
808 |
+
},
|
809 |
+
{
|
810 |
+
"current": 2400.034,
|
811 |
+
"min": 0.0,
|
812 |
+
"max": 0.0
|
813 |
+
},
|
814 |
+
{
|
815 |
+
"current": 2400.034,
|
816 |
+
"min": 0.0,
|
817 |
+
"max": 0.0
|
818 |
+
},
|
819 |
+
{
|
820 |
+
"current": 2400.034,
|
821 |
+
"min": 0.0,
|
822 |
+
"max": 0.0
|
823 |
+
},
|
824 |
+
{
|
825 |
+
"current": 2400.034,
|
826 |
+
"min": 0.0,
|
827 |
+
"max": 0.0
|
828 |
+
},
|
829 |
+
{
|
830 |
+
"current": 2400.034,
|
831 |
+
"min": 0.0,
|
832 |
+
"max": 0.0
|
833 |
+
}
|
834 |
+
],
|
835 |
+
"disk": {
|
836 |
+
"/": {
|
837 |
+
"total": 0.0625,
|
838 |
+
"used": 1.1444091796875e-05
|
839 |
+
}
|
840 |
+
},
|
841 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
842 |
+
"gpu_count": 8,
|
843 |
+
"gpu_devices": [
|
844 |
+
{
|
845 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
846 |
+
"memory_total": 42949672960
|
847 |
+
},
|
848 |
+
{
|
849 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
850 |
+
"memory_total": 42949672960
|
851 |
+
},
|
852 |
+
{
|
853 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
854 |
+
"memory_total": 42949672960
|
855 |
+
},
|
856 |
+
{
|
857 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
858 |
+
"memory_total": 42949672960
|
859 |
+
},
|
860 |
+
{
|
861 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
862 |
+
"memory_total": 42949672960
|
863 |
+
},
|
864 |
+
{
|
865 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
866 |
+
"memory_total": 42949672960
|
867 |
+
},
|
868 |
+
{
|
869 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
870 |
+
"memory_total": 42949672960
|
871 |
+
},
|
872 |
+
{
|
873 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
874 |
+
"memory_total": 42949672960
|
875 |
+
}
|
876 |
+
],
|
877 |
+
"memory": {
|
878 |
+
"total": 453.4449462890625
|
879 |
+
}
|
880 |
+
}
|
wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 4}}
|
wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-24 20:20:22,655 INFO StreamThr :25836 [internal.py:wandb_internal():86] W&B internal server running at pid: 25836, started at: 2024-08-24 20:20:22.654049
|
2 |
+
2024-08-24 20:20:22,656 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-24 20:20:22,659 INFO WriterThread:25836 [datastore.py:open_for_write():87] open: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
|
4 |
+
2024-08-24 20:20:22,660 DEBUG SenderThread:25836 [sender.py:send():382] send: header
|
5 |
+
2024-08-24 20:20:22,676 DEBUG SenderThread:25836 [sender.py:send():382] send: run
|
6 |
+
2024-08-24 20:20:23,101 INFO SenderThread:25836 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240824_202022-z2bjbf6e/files
|
7 |
+
2024-08-24 20:20:23,101 INFO SenderThread:25836 [sender.py:_start_run_threads():1136] run started: z2bjbf6e with start time 1724498422.652614
|
8 |
+
2024-08-24 20:20:23,106 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-24 20:20:23,106 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-24 20:20:23,175 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-24 20:20:23,182 INFO HandlerThread:25836 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-24 20:20:23,182 INFO SystemMonitor:25836 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-24 20:20:23,183 INFO HandlerThread:25836 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-24 20:20:23,184 INFO SystemMonitor:25836 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-24 20:20:23,185 INFO SystemMonitor:25836 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-24 20:20:23,186 INFO SystemMonitor:25836 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-24 20:20:23,248 DEBUG HandlerThread:25836 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-24 20:20:23,250 DEBUG HandlerThread:25836 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-24T11:20:23.248321', 'startedAt': '2024-08-24T11:20:22.637930', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '131072', '--micro-batch-size', '16', '--valid_micro_batch_size', '1', '--global-batch-size', '1280', '--train-iters', '23178', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--test-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--lr', '3.5e-6', '--min-lr', '3.5e-7', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '23178', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--num-workers', '4', '--fsdp-activation-checkpointing', '--sharding-strategy', 'NO_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-baseline-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'yans_experiment', '--wandb-name', 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 144, 'cpu_count_logical': 144, 'cpu_freq': {'current': 2400.0340000000015, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 453.4449462890625}}
|
26 |
+
2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-24 20:20:23,266 INFO HandlerThread:25836 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-24 20:20:23,272 DEBUG SenderThread:25836 [sender.py:send():382] send: files
|
30 |
+
2024-08-24 20:20:23,272 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-24 20:20:23,283 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-24 20:20:23,284 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-24 20:20:23,286 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-24 20:20:23,526 DEBUG SenderThread:25836 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-24 20:20:23,973 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /tmp/tmpwjpjqs3pwandb/55szr5f9-wandb-metadata.json
|
38 |
+
2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
39 |
+
2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json
|
40 |
+
2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
|
41 |
+
2024-08-24 20:20:26,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
42 |
+
2024-08-24 20:20:27,701 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-24 20:20:27,737 DEBUG SenderThread:25836 [sender.py:send():382] send: exit
|
44 |
+
2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():589] handling exit code: 1
|
45 |
+
2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():591] handling runtime: 4
|
46 |
+
2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
47 |
+
2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:send_exit():597] send defer
|
48 |
+
2024-08-24 20:20:27,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
49 |
+
2024-08-24 20:20:27,739 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 0
|
50 |
+
2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
51 |
+
2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 0
|
52 |
+
2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 1
|
53 |
+
2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
54 |
+
2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 1
|
55 |
+
2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
56 |
+
2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 1
|
57 |
+
2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 2
|
58 |
+
2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
59 |
+
2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 2
|
60 |
+
2024-08-24 20:20:27,740 INFO HandlerThread:25836 [system_monitor.py:finish():203] Stopping system monitor
|
61 |
+
2024-08-24 20:20:27,740 DEBUG SystemMonitor:25836 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
62 |
+
2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined cpu monitor
|
63 |
+
2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
64 |
+
2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined disk monitor
|
65 |
+
2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():183] Publishing last batch of metrics
|
66 |
+
2024-08-24 20:20:28,105 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
67 |
+
2024-08-24 20:20:28,106 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
|
68 |
+
2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined gpu monitor
|
69 |
+
2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined memory monitor
|
70 |
+
2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined network monitor
|
71 |
+
2024-08-24 20:20:28,918 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
72 |
+
2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
73 |
+
2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 2
|
74 |
+
2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 3
|
75 |
+
2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send():382] send: stats
|
76 |
+
2024-08-24 20:20:28,920 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
77 |
+
2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
78 |
+
2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 3
|
79 |
+
2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
80 |
+
2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 3
|
81 |
+
2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 4
|
82 |
+
2024-08-24 20:20:28,921 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
83 |
+
2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 4
|
84 |
+
2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
85 |
+
2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 4
|
86 |
+
2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 5
|
87 |
+
2024-08-24 20:20:28,922 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
88 |
+
2024-08-24 20:20:28,922 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 5
|
89 |
+
2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send():382] send: summary
|
90 |
+
2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
91 |
+
2024-08-24 20:20:28,923 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
92 |
+
2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 5
|
93 |
+
2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 6
|
94 |
+
2024-08-24 20:20:28,923 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
95 |
+
2024-08-24 20:20:28,924 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 6
|
96 |
+
2024-08-24 20:20:28,924 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
97 |
+
2024-08-24 20:20:28,924 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 6
|
98 |
+
2024-08-24 20:20:28,927 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
|
99 |
+
2024-08-24 20:20:29,107 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
|
100 |
+
2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 7
|
101 |
+
2024-08-24 20:20:29,126 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
102 |
+
2024-08-24 20:20:29,126 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 7
|
103 |
+
2024-08-24 20:20:29,126 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
104 |
+
2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 7
|
105 |
+
2024-08-24 20:20:29,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
106 |
+
2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
|
107 |
+
2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
108 |
+
2024-08-24 20:20:31,391 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 8
|
109 |
+
2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
110 |
+
2024-08-24 20:20:31,392 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
111 |
+
2024-08-24 20:20:31,392 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 8
|
112 |
+
2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
113 |
+
2024-08-24 20:20:31,392 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 8
|
114 |
+
2024-08-24 20:20:31,392 INFO SenderThread:25836 [job_builder.py:build():296] Attempting to build job artifact
|
115 |
+
2024-08-24 20:20:31,393 INFO SenderThread:25836 [job_builder.py:_get_source_type():426] is repo sourced job
|
116 |
+
2024-08-24 20:20:31,408 INFO SenderThread:25836 [job_builder.py:build():402] adding wandb-job metadata file
|
117 |
+
2024-08-24 20:20:31,417 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 9
|
118 |
+
2024-08-24 20:20:31,418 DEBUG SenderThread:25836 [sender.py:send():382] send: artifact
|
119 |
+
2024-08-24 20:20:31,418 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
120 |
+
2024-08-24 20:20:31,419 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 9
|
121 |
+
2024-08-24 20:20:31,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
122 |
+
2024-08-24 20:20:32,109 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
123 |
+
2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjgwMzg3NA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'versionIndex': 0}}}
|
124 |
+
2024-08-24 20:20:34,782 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
125 |
+
2024-08-24 20:20:34,782 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
|
126 |
+
2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 9
|
127 |
+
2024-08-24 20:20:34,783 INFO SenderThread:25836 [dir_watcher.py:finish():358] shutting down directory watcher
|
128 |
+
2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240824_202022-z2bjbf6e/files
|
129 |
+
2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt requirements.txt
|
130 |
+
2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml config.yaml
|
131 |
+
2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json wandb-metadata.json
|
132 |
+
2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json wandb-summary.json
|
133 |
+
2024-08-24 20:20:35,113 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log output.log
|
134 |
+
2024-08-24 20:20:35,115 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 10
|
135 |
+
2024-08-24 20:20:35,115 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
136 |
+
2024-08-24 20:20:35,116 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
137 |
+
2024-08-24 20:20:35,117 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 10
|
138 |
+
2024-08-24 20:20:35,117 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
139 |
+
2024-08-24 20:20:35,117 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 10
|
140 |
+
2024-08-24 20:20:35,117 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
|
141 |
+
2024-08-24 20:20:35,574 INFO wandb-upload_1:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
|
142 |
+
2024-08-24 20:20:35,574 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
|
143 |
+
2024-08-24 20:20:35,580 INFO wandb-upload_2:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
|
144 |
+
2024-08-24 20:20:35,588 INFO wandb-upload_3:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
|
145 |
+
2024-08-24 20:20:35,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
146 |
+
2024-08-24 20:20:35,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
147 |
+
2024-08-24 20:20:35,788 INFO Thread-11 (_thread_body):25836 [sender.py:transition_state():617] send defer: 11
|
148 |
+
2024-08-24 20:20:35,788 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
149 |
+
2024-08-24 20:20:35,788 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 11
|
150 |
+
2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
151 |
+
2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 11
|
152 |
+
2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
|
153 |
+
2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 12
|
154 |
+
2024-08-24 20:20:35,789 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
155 |
+
2024-08-24 20:20:35,789 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 12
|
156 |
+
2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
157 |
+
2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 12
|
158 |
+
2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_stream.py:finish():595] file stream finish called
|
159 |
+
2024-08-24 20:20:36,056 INFO SenderThread:25836 [file_stream.py:finish():599] file stream finish is done
|
160 |
+
2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 13
|
161 |
+
2024-08-24 20:20:36,056 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
162 |
+
2024-08-24 20:20:36,056 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 13
|
163 |
+
2024-08-24 20:20:36,056 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
164 |
+
2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 13
|
165 |
+
2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 14
|
166 |
+
2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
|
167 |
+
2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: final
|
168 |
+
2024-08-24 20:20:36,057 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 14
|
169 |
+
2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: footer
|
170 |
+
2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
|
171 |
+
2024-08-24 20:20:36,057 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 14
|
172 |
+
2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
173 |
+
2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
174 |
+
2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
|
175 |
+
2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: server_info
|
176 |
+
2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
|
177 |
+
2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: server_info
|
178 |
+
2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: get_summary
|
179 |
+
2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: sampled_history
|
180 |
+
2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
|
181 |
+
2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: job_info
|
182 |
+
2024-08-24 20:20:36,224 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: job_info
|
183 |
+
2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
184 |
+
2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
185 |
+
2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
186 |
+
2024-08-24 20:20:36,225 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: shutdown
|
187 |
+
2024-08-24 20:20:36,225 INFO HandlerThread:25836 [handler.py:finish():869] shutting down handler
|
188 |
+
2024-08-24 20:20:37,061 INFO WriterThread:25836 [datastore.py:close():296] close: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
|
189 |
+
2024-08-24 20:20:37,224 INFO SenderThread:25836 [sender.py:finish():1572] shutting down sender
|
190 |
+
2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
|
191 |
+
2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240824_202022-z2bjbf6e/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Configure stats pid to 25210
|
3 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train baseline'}
|
6 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug.log
|
9 |
+
2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log
|
10 |
+
2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'test_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 4, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07', 'wandb_project': 'yans_experiment', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 3.5e-06, 'lr_decay_style': 'cosine', 'lr_decay_iters': 23178, 'lr_warmup_iters': 500, 'min_lr': 3.5e-07, 'train_iters': 23178, 'train_samples': None, 'global_batch_size': 1280, 'micro_batch_size': 16, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-baseline-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 8, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 10}
|
13 |
+
2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-24 20:20:22,651 INFO MainThread:25210 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-24 20:20:22,652 INFO MainThread:25210 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-24 20:20:22,659 INFO MainThread:25210 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-24 20:20:22,672 INFO MainThread:25210 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-24 20:20:23,105 INFO MainThread:25210 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-24 20:20:23,284 INFO MainThread:25210 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-24 20:20:37,225 WARNING MsgRouterThr:25210 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
ADDED
Binary file (18.8 kB). View file
|
|
wandb/run-20240826_221726-7jzdp89j/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1205770'
|
38 |
+
- /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1205770'
|
43 |
+
- /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 1024
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 4
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-1.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: yans-baseline-qwen2-1.5B-3.5e-5_train_2024-08-26-22:17:00
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: yans_experiment
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-1.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 200
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 200
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 3.5e-05
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 23178
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 3.5e-06
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 23178
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 1280
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 16
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-08
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/yans-baseline-qwen2-1.5B-3.5e-5
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 1
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 8
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 10
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724678246.995911
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
322 |
+
model_architecture:
|
323 |
+
desc: null
|
324 |
+
value: Qwen2ForCausalLM
|
325 |
+
activation_function:
|
326 |
+
desc: null
|
327 |
+
value: silu
|
328 |
+
hidden_size:
|
329 |
+
desc: null
|
330 |
+
value: 1536
|
331 |
+
model_type:
|
332 |
+
desc: null
|
333 |
+
value: qwen2
|
334 |
+
max_position_embeddings:
|
335 |
+
desc: null
|
336 |
+
value: 1024
|
337 |
+
num_attention_heads:
|
338 |
+
desc: null
|
339 |
+
value: 12
|
340 |
+
num_hidden_layers:
|
341 |
+
desc: null
|
342 |
+
value: 28
|
wandb/run-20240826_221726-7jzdp89j/files/output.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240826_221726-7jzdp89j/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|