koichi12 committed (verified)
Commit ca3e41a · 1 Parent(s): 09e5c81

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. wandb/run-20240802_173428-s75vpwte/files/config.yaml +335 -0
  2. wandb/run-20240802_173428-s75vpwte/files/output.log +0 -0
  3. wandb/run-20240802_173428-s75vpwte/files/requirements.txt +271 -0
  4. wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json +215 -0
  5. wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json +1 -0
  6. wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log +0 -0
  7. wandb/run-20240802_173428-s75vpwte/logs/debug.log +29 -0
  8. wandb/run-20240804_135607-ikp7tdz1/files/config.yaml +335 -0
  9. wandb/run-20240804_135607-ikp7tdz1/files/output.log +130 -0
  10. wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt +271 -0
  11. wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json +215 -0
  12. wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json +1 -0
  13. wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log +216 -0
  14. wandb/run-20240804_135607-ikp7tdz1/logs/debug.log +30 -0
  15. wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb +0 -0
  16. wandb/run-20240812_070449-ufge4h1y/files/config.yaml +335 -0
  17. wandb/run-20240812_070449-ufge4h1y/files/output.log +158 -0
  18. wandb/run-20240812_070449-ufge4h1y/files/requirements.txt +271 -0
  19. wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json +215 -0
  20. wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json +1 -0
  21. wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log +616 -0
  22. wandb/run-20240812_070449-ufge4h1y/logs/debug.log +29 -0
  23. wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb +0 -0
  24. wandb/run-20240812_073202-yby212na/files/config.yaml +335 -0
  25. wandb/run-20240812_073202-yby212na/files/output.log +116 -0
  26. wandb/run-20240812_073202-yby212na/files/requirements.txt +271 -0
  27. wandb/run-20240812_073202-yby212na/files/wandb-metadata.json +215 -0
  28. wandb/run-20240812_073202-yby212na/files/wandb-summary.json +1 -0
  29. wandb/run-20240812_073202-yby212na/logs/debug-internal.log +236 -0
  30. wandb/run-20240812_073202-yby212na/logs/debug.log +29 -0
  31. wandb/run-20240812_073202-yby212na/run-yby212na.wandb +0 -0
  32. wandb/run-20240815_041534-1ld4rgmy/files/config.yaml +337 -0
  33. wandb/run-20240815_041534-1ld4rgmy/files/output.log +92 -0
  34. wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt +354 -0
  35. wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json +215 -0
  36. wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json +1 -0
  37. wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log +162 -0
  38. wandb/run-20240815_041534-1ld4rgmy/logs/debug.log +29 -0
  39. wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb +0 -0
  40. wandb/run-20240824_202022-z2bjbf6e/files/config.yaml +321 -0
  41. wandb/run-20240824_202022-z2bjbf6e/files/output.log +51 -0
  42. wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt +375 -0
  43. wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json +880 -0
  44. wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json +1 -0
  45. wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log +191 -0
  46. wandb/run-20240824_202022-z2bjbf6e/logs/debug.log +28 -0
  47. wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb +0 -0
  48. wandb/run-20240826_221726-7jzdp89j/files/config.yaml +342 -0
  49. wandb/run-20240826_221726-7jzdp89j/files/output.log +0 -0
  50. wandb/run-20240826_221726-7jzdp89j/files/requirements.txt +375 -0
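The commit message above says these files were added with the upload-large-folder tool. For context, a minimal sketch of how such an upload is usually issued with a recent huggingface_hub release (the target repo id and local folder below are illustrative assumptions, not read from this commit):

    # Sketch only: assumes huggingface_hub >= 0.25, which ships upload_large_folder.
    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_large_folder(
        repo_id="koichi12/tiny-mistral-sample",  # illustrative target repo, not confirmed by the commit
        folder_path="wandb",                     # local folder to mirror; assumed here
        repo_type="model",
    )

The tool is built for large, resumable uploads, which matches the scale of this commit (more files than the 50 the web view renders).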
wandb/run-20240802_173428-s75vpwte/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-02-17:34:15
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722587668.341658
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 512
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
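The batch-size fields in the config above are mutually consistent: the global batch equals the micro batch times the gradient-accumulation steps times the data-parallel world size. A quick check in plain Python with the values copied from the config (the relation is the standard one; treating it as the exact rule this codebase uses is an assumption):

    # Values copied from the config.yaml diff above.
    micro_batch_size = 8
    gradient_accumulation_steps = 40
    world_size = 1

    # Standard relation for data-parallel training with gradient accumulation.
    effective_global_batch = micro_batch_size * gradient_accumulation_steps * world_size
    assert effective_global_batch == 320  # matches global_batch_size in the config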
wandb/run-20240802_173428-s75vpwte/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240802_173428-s75vpwte/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240802_173428-s75vpwte/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-02T08:34:28.941229",
5
+ "startedAt": "2024-08-02T08:34:28.326109",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample_train_2024-08-02-17:34:15"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0409999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.041,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.041,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.041,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.041,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.041,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.041,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.041,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.041,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.041,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.041,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.041,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.041,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.041,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.041,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.041,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.041,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.041,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.041,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240802_173428-s75vpwte/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 8.780712127685547, "training/perplexity": 6507.50970149773, "utils/batch_size": 8, "utils/global_batch_size": 320, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 1410, "optimizer/lr": 1.989808738231659e-05, "optimizer/variance_l2": 0.013855160145659429, "optimizer/variance_sqrt_l2": 0.9992841304001847, "optimizer/momentum_l2": 0.9839698623853019, "optimizer/weight_l2": 101.83051175850979, "optimizer/variance_l1": 1.002197265625, "optimizer/variance_sqrt_l1": 536.5, "optimizer/momentum_l1": 403.875, "optimizer/weight_l1": 332288.0, "optimizer/variance_abs_max": 0.0011444091796875, "optimizer/variance_sqrt_abs_max": 0.033935546875, "optimizer/momentum_abs_max": 0.03369140625, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 1.277997902000152, "stats/tokens_per_sec": 128450.91509389698, "stats/tokens_per_sec_per_gpu": 128450.91509389698, "stats/tflops": 9.093190310165799, "_timestamp": 1722589282.0763872, "_runtime": 1613.73472905159, "_step": 1410, "evaluation/val_loss": 8.783937454223633, "evaluation/val_ppl": 6528.5322265625, "_wandb": {"runtime": 1614}}
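The logged metrics in this summary hang together: the reported perplexity is exp(loss), and the token throughput follows from the global batch size, the sequence length, and the measured iteration time. A quick sanity check in plain Python with the numbers copied from the JSON above (the formulas are the usual definitions, assumed rather than taken from this codebase):

    import math

    # Copied from wandb-summary.json above.
    loss = 8.780712127685547
    print(math.exp(loss))  # ~6507.5, matches training/perplexity

    global_batch_size = 320
    seq_len = 513                   # utils/seq_len
    iteration_time = 1.277997902    # stats/1_iteration_time, in seconds
    tokens_per_sec = global_batch_size * seq_len / iteration_time
    print(tokens_per_sec)  # ~128450.9, matches stats/tokens_per_sec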
wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240802_173428-s75vpwte/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-02 17:34:28,332 INFO MainThread:13969 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Configure stats pid to 13969
3
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug.log
9
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240802_173428-s75vpwte/logs/debug-internal.log
10
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-02-17:34:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():616] starting backend
14
+ 2024-08-02 17:34:28,333 INFO MainThread:13969 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-02 17:34:28,339 INFO MainThread:13969 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-02 17:34:28,341 INFO MainThread:13969 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-02 17:34:28,346 INFO MainThread:13969 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-02 17:34:28,360 INFO MainThread:13969 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-02 17:34:28,832 INFO MainThread:13969 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-02 17:34:28,915 INFO MainThread:13969 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-02 17:34:28,976 INFO MainThread:13969 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-02 17:34:28,977 INFO MainThread:13969 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-02 17:34:33,327 INFO MainThread:13969 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240804_135607-ikp7tdz1/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 256
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama-sample_train_2024-08-04-13:55:35
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 2048
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722747367.911791
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_135607-ikp7tdz1/files/output.log ADDED
@@ -0,0 +1,130 @@
1
+ Created Hugging Face repository with ID koichi12/tiny-llama-sample.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ BFloat16 enabled for mixed precision - using bfSixteen policy
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 640000
21
+ validation: 35200
22
+ test: 3200
23
+ > building train, validation, and test datasets for GPT ...
24
+ > finished creating GPT datasets ...
25
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
26
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
27
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
28
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
29
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
30
+ model info: FullyShardedDataParallel(
31
+ (_fsdp_wrapped_module): LlamaForCausalLM(
32
+ (model): LlamaModel(
33
+ (embed_tokens): Embedding(32000, 2048)
34
+ (layers): ModuleList(
35
+ (0-21): 22 x FullyShardedDataParallel(
36
+ (_fsdp_wrapped_module): CheckpointWrapper(
37
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
38
+ (self_attn): LlamaFlashAttention2(
39
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
40
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
41
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
42
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
43
+ (rotary_emb): LlamaRotaryEmbedding()
44
+ )
45
+ (mlp): LlamaMLP(
46
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
47
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
48
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
49
+ (act_fn): SiLU()
50
+ )
51
+ (input_layernorm): LlamaRMSNorm()
52
+ (post_attention_layernorm): LlamaRMSNorm()
53
+ )
54
+ )
55
+ )
56
+ )
57
+ (norm): LlamaRMSNorm()
58
+ (rotary_emb): LlamaRotaryEmbedding()
59
+ )
60
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
61
+ )
62
+ )
63
+ model config: LlamaConfig {
64
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
65
+ "architectures": [
66
+ "LlamaForCausalLM"
67
+ ],
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "bos_token_id": 1,
71
+ "eos_token_id": 2,
72
+ "hidden_act": "silu",
73
+ "hidden_size": 2048,
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 5632,
76
+ "label_smoothing": 0.0,
77
+ "max_position_embeddings": 2048,
78
+ "mlp_bias": false,
79
+ "model_type": "llama",
80
+ "num_attention_heads": 32,
81
+ "num_hidden_layers": 22,
82
+ "num_key_value_heads": 4,
83
+ "pretraining_tp": 1,
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_scaling": null,
86
+ "rope_theta": 10000.0,
87
+ "tie_word_embeddings": false,
88
+ "torch_dtype": "float32",
89
+ "transformers_version": "4.43.3",
90
+ "use_cache": false,
91
+ "vocab_size": 32000
92
+ }
93
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
94
+ warnings.warn(
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
108
+ loss: torch.Tensor = model(**batch).loss
109
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
110
+ return self._call_impl(*args, **kwargs)
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
112
+ return forward_call(*args, **kwargs)
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
114
+ output = self._fsdp_wrapped_module(*args, **kwargs)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
116
+ return self._call_impl(*args, **kwargs)
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
118
+ return forward_call(*args, **kwargs)
119
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
120
+ outputs = self.model(
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
122
+ return self._call_impl(*args, **kwargs)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
124
+ return forward_call(*args, **kwargs)
125
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
126
+ cache_position = torch.arange(
127
+ RuntimeError: CUDA error: device-side assert triggered
128
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
129
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
130
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
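The run above dies with a device-side assert, and, as the log itself notes, the stack trace is reported asynchronously, so the cache_position line it points at is not necessarily where the assert fired. A hedged sketch of the re-run the log suggests, forcing synchronous CUDA error reporting (the variable must be set before any CUDA work happens, so the launching shell or the very top of the entry script are the usual places):

    # Sketch only: make CUDA errors surface at the failing kernel launch.
    # Must run before torch creates a CUDA context (i.e., before any .cuda()/.to("cuda") call).
    import os
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    # ...then launch training as before, e.g. examples/finetuning.py with the same arguments.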
wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T04:56:08.637907",
5
+ "startedAt": "2024-08-04T04:56:07.879507",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "256",
11
+ "--sliding-window-size",
12
+ "2048",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama-sample_train_2024-08-04-13:55:35"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
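The "program", "executable" and "args" fields in this metadata file together record the exact command line that launched the run. As a minimal illustrative sketch (not part of the original upload), such a wandb-metadata.json can be turned back into a runnable command with a few lines of Python; the path below is assumed to match this repository layout.

import json
import shlex

# Assumed path, matching the layout of this upload.
meta_path = "wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json"

with open(meta_path) as f:
    meta = json.load(f)

# Rebuild the launch command: interpreter + program + recorded args.
cmd = [meta["executable"], meta["program"], *meta["args"]]
print(shlex.join(cmd))
# -> /usr/bin/python /project/examples/finetuning.py --seq-length 256 ...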
wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 67}}
wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log ADDED
@@ -0,0 +1,216 @@
1
+ 2024-08-04 13:56:07,912 INFO StreamThr :9151 [internal.py:wandb_internal():86] W&B internal server running at pid: 9151, started at: 2024-08-04 13:56:07.911369
2
+ 2024-08-04 13:56:07,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 13:56:07,916 INFO WriterThread:9151 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
4
+ 2024-08-04 13:56:07,917 DEBUG SenderThread:9151 [sender.py:send():382] send: header
5
+ 2024-08-04 13:56:08,068 DEBUG SenderThread:9151 [sender.py:send():382] send: run
6
+ 2024-08-04 13:56:08,527 INFO SenderThread:9151 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_135607-ikp7tdz1/files
7
+ 2024-08-04 13:56:08,527 INFO SenderThread:9151 [sender.py:_start_run_threads():1136] run started: ikp7tdz1 with start time 1722747367.911791
8
+ 2024-08-04 13:56:08,532 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 13:56:08,533 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 13:56:08,619 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 13:56:08,625 DEBUG HandlerThread:9151 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 13:56:08,625 INFO HandlerThread:9151 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 13:56:08,625 INFO SystemMonitor:9151 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 13:56:08,626 INFO HandlerThread:9151 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 13:56:08,626 INFO SystemMonitor:9151 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 13:56:08,627 INFO SystemMonitor:9151 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 13:56:08,628 INFO SystemMonitor:9151 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 13:56:08,629 INFO SystemMonitor:9151 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 13:56:08,637 DEBUG HandlerThread:9151 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 13:56:08,639 DEBUG HandlerThread:9151 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 13:56:08,651 DEBUG HandlerThread:9151 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T04:56:08.637907', 'startedAt': '2024-08-04T04:56:07.879507', 'docker': None, 'cuda': None, 'args': ('--seq-length', '256', '--sliding-window-size', '2048', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-13:55:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 
42949672960}], 'memory': {'total': 56.48781967163086}}
26
+ 2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 13:56:08,651 INFO HandlerThread:9151 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 13:56:08,653 INFO HandlerThread:9151 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 13:56:08,681 DEBUG SenderThread:9151 [sender.py:send():382] send: files
30
+ 2024-08-04 13:56:08,681 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 13:56:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 13:56:08,691 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 13:56:08,691 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 13:56:08,692 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 13:56:08,938 DEBUG SenderThread:9151 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 13:56:09,405 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /tmp/tmpins_li9awandb/mkgvo0s4-wandb-metadata.json
38
+ 2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
39
+ 2024-08-04 13:56:09,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json
40
+ 2024-08-04 13:56:10,529 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
41
+ 2024-08-04 13:56:12,531 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
42
+ 2024-08-04 13:56:13,586 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-04 13:56:16,533 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
44
+ 2024-08-04 13:56:19,567 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-04 13:56:23,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
46
+ 2024-08-04 13:56:23,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
47
+ 2024-08-04 13:56:23,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
48
+ 2024-08-04 13:56:24,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-04 13:56:29,913 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-04 13:56:34,914 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-04 13:56:38,689 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-04 13:56:38,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-04 13:56:38,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-04 13:56:39,955 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-04 13:56:40,547 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
56
+ 2024-08-04 13:56:45,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-04 13:56:50,164 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-04 13:56:53,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-04 13:56:53,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-04 13:56:53,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-04 13:56:55,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-04 13:57:00,957 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-04 13:57:05,958 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-04 13:57:08,629 DEBUG SystemMonitor:9151 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-04 13:57:08,630 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
66
+ 2024-08-04 13:57:08,690 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-04 13:57:08,690 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-04 13:57:08,732 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-04 13:57:11,872 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-04 13:57:15,196 DEBUG SenderThread:9151 [sender.py:send():382] send: config
71
+ 2024-08-04 13:57:15,197 DEBUG SenderThread:9151 [sender.py:send():382] send: config
72
+ 2024-08-04 13:57:16,571 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
73
+ 2024-08-04 13:57:16,600 DEBUG SenderThread:9151 [sender.py:send():382] send: exit
74
+ 2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():589] handling exit code: 1
75
+ 2024-08-04 13:57:16,601 INFO SenderThread:9151 [sender.py:send_exit():591] handling runtime: 67
76
+ 2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
77
+ 2024-08-04 13:57:16,602 INFO SenderThread:9151 [sender.py:send_exit():597] send defer
78
+ 2024-08-04 13:57:16,602 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 0
80
+ 2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 0
82
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 1
83
+ 2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 1
85
+ 2024-08-04 13:57:16,603 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
86
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 1
87
+ 2024-08-04 13:57:16,603 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 2
88
+ 2024-08-04 13:57:16,603 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 2
90
+ 2024-08-04 13:57:16,603 INFO HandlerThread:9151 [system_monitor.py:finish():203] Stopping system monitor
91
+ 2024-08-04 13:57:16,603 DEBUG SystemMonitor:9151 [system_monitor.py:_start():179] Finished system metrics aggregation loop
92
+ 2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined cpu monitor
93
+ 2024-08-04 13:57:16,604 DEBUG SystemMonitor:9151 [system_monitor.py:_start():183] Publishing last batch of metrics
94
+ 2024-08-04 13:57:16,604 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined disk monitor
95
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined gpu monitor
96
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined memory monitor
97
+ 2024-08-04 13:57:16,637 INFO HandlerThread:9151 [interfaces.py:finish():202] Joined network monitor
98
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 2
100
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 3
101
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send():382] send: stats
102
+ 2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 3
104
+ 2024-08-04 13:57:16,638 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 3
106
+ 2024-08-04 13:57:16,638 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 4
107
+ 2024-08-04 13:57:16,638 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
108
+ 2024-08-04 13:57:16,638 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 4
109
+ 2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
110
+ 2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 4
111
+ 2024-08-04 13:57:16,639 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 5
112
+ 2024-08-04 13:57:16,639 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-04 13:57:16,639 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 5
114
+ 2024-08-04 13:57:16,639 DEBUG SenderThread:9151 [sender.py:send():382] send: summary
115
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
116
+ 2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
117
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 5
118
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 6
119
+ 2024-08-04 13:57:16,640 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-04 13:57:16,640 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 6
121
+ 2024-08-04 13:57:16,640 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
122
+ 2024-08-04 13:57:16,640 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 6
123
+ 2024-08-04 13:57:16,643 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: status_report
124
+ 2024-08-04 13:57:16,835 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 7
125
+ 2024-08-04 13:57:16,836 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-04 13:57:16,836 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 7
127
+ 2024-08-04 13:57:16,836 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
128
+ 2024-08-04 13:57:16,836 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 7
129
+ 2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
130
+ 2024-08-04 13:57:17,572 INFO Thread-12 :9151 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
131
+ 2024-08-04 13:57:17,600 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
132
+ 2024-08-04 13:57:18,334 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 8
133
+ 2024-08-04 13:57:18,334 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
134
+ 2024-08-04 13:57:18,334 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
135
+ 2024-08-04 13:57:18,335 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 8
136
+ 2024-08-04 13:57:18,335 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
137
+ 2024-08-04 13:57:18,335 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 8
138
+ 2024-08-04 13:57:18,335 INFO SenderThread:9151 [job_builder.py:build():296] Attempting to build job artifact
139
+ 2024-08-04 13:57:18,336 INFO SenderThread:9151 [job_builder.py:_get_source_type():426] is repo sourced job
140
+ 2024-08-04 13:57:18,350 INFO SenderThread:9151 [job_builder.py:build():402] adding wandb-job metadata file
141
+ 2024-08-04 13:57:18,359 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 9
142
+ 2024-08-04 13:57:18,360 DEBUG SenderThread:9151 [sender.py:send():382] send: artifact
143
+ 2024-08-04 13:57:18,360 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 13:57:18,361 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 9
145
+ 2024-08-04 13:57:18,573 INFO Thread-12 :9151 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
146
+ 2024-08-04 13:57:18,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
147
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
148
+ 2024-08-04 13:57:19,234 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 9
150
+ 2024-08-04 13:57:19,234 INFO SenderThread:9151 [dir_watcher.py:finish():358] shutting down directory watcher
151
+ 2024-08-04 13:57:19,573 INFO SenderThread:9151 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_135607-ikp7tdz1/files
152
+ 2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt requirements.txt
153
+ 2024-08-04 13:57:19,574 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml config.yaml
154
+ 2024-08-04 13:57:19,575 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-metadata.json wandb-metadata.json
155
+ 2024-08-04 13:57:19,576 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json wandb-summary.json
156
+ 2024-08-04 13:57:19,577 INFO SenderThread:9151 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log output.log
157
+ 2024-08-04 13:57:19,579 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 10
158
+ 2024-08-04 13:57:19,579 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
159
+ 2024-08-04 13:57:19,579 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
160
+ 2024-08-04 13:57:19,580 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 10
161
+ 2024-08-04 13:57:19,581 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
162
+ 2024-08-04 13:57:19,581 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 10
163
+ 2024-08-04 13:57:19,581 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
164
+ 2024-08-04 13:57:19,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
165
+ 2024-08-04 13:57:19,601 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
166
+ 2024-08-04 13:57:19,983 INFO wandb-upload_0:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/requirements.txt
167
+ 2024-08-04 13:57:20,084 INFO wandb-upload_1:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/config.yaml
168
+ 2024-08-04 13:57:20,165 INFO wandb-upload_2:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/wandb-summary.json
169
+ 2024-08-04 13:57:20,334 INFO wandb-upload_3:9151 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_135607-ikp7tdz1/files/output.log
170
+ 2024-08-04 13:57:20,534 INFO Thread-11 (_thread_body):9151 [sender.py:transition_state():617] send defer: 11
171
+ 2024-08-04 13:57:20,534 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
172
+ 2024-08-04 13:57:20,534 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 11
173
+ 2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
174
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 11
175
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
176
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 12
177
+ 2024-08-04 13:57:20,535 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
178
+ 2024-08-04 13:57:20,535 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 12
179
+ 2024-08-04 13:57:20,535 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
180
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 12
181
+ 2024-08-04 13:57:20,535 INFO SenderThread:9151 [file_stream.py:finish():595] file stream finish called
182
+ 2024-08-04 13:57:20,601 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
183
+ 2024-08-04 13:57:20,717 INFO SenderThread:9151 [file_stream.py:finish():599] file stream finish is done
184
+ 2024-08-04 13:57:20,717 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 13
185
+ 2024-08-04 13:57:20,717 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
186
+ 2024-08-04 13:57:20,717 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 13
188
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
189
+ 2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 13
190
+ 2024-08-04 13:57:20,718 INFO SenderThread:9151 [sender.py:transition_state():617] send defer: 14
191
+ 2024-08-04 13:57:20,718 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: defer
192
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: final
193
+ 2024-08-04 13:57:20,718 INFO HandlerThread:9151 [handler.py:handle_request_defer():172] handle defer: 14
194
+ 2024-08-04 13:57:20,718 DEBUG SenderThread:9151 [sender.py:send():382] send: footer
195
+ 2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: defer
196
+ 2024-08-04 13:57:20,719 INFO SenderThread:9151 [sender.py:send_request_defer():613] handle sender defer: 14
197
+ 2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
198
+ 2024-08-04 13:57:20,719 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: poll_exit
199
+ 2024-08-04 13:57:20,719 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
200
+ 2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: server_info
201
+ 2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: poll_exit
202
+ 2024-08-04 13:57:20,720 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: get_summary
203
+ 2024-08-04 13:57:20,720 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: server_info
204
+ 2024-08-04 13:57:20,721 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: sampled_history
205
+ 2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: internal_messages
206
+ 2024-08-04 13:57:20,722 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: job_info
207
+ 2024-08-04 13:57:20,885 DEBUG SenderThread:9151 [sender.py:send_request():409] send_request: job_info
208
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3866] rendering history
209
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
210
+ 2024-08-04 13:57:20,885 INFO MainThread:9151 [wandb_run.py:_footer_sync_info():3825] logging synced files
211
+ 2024-08-04 13:57:20,886 DEBUG HandlerThread:9151 [handler.py:handle_request():146] handle_request: shutdown
212
+ 2024-08-04 13:57:20,886 INFO HandlerThread:9151 [handler.py:finish():869] shutting down handler
213
+ 2024-08-04 13:57:21,722 INFO WriterThread:9151 [datastore.py:close():296] close: /project/wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb
214
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [sender.py:finish():1572] shutting down sender
215
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:finish():172] shutting down file pusher
216
+ 2024-08-04 13:57:21,885 INFO SenderThread:9151 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_135607-ikp7tdz1/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Configure stats pid to 9079
3
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug.log
9
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_135607-ikp7tdz1/logs/debug-internal.log
10
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 13:56:07,904 INFO MainThread:9079 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 256, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-13:55:35', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 2048, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 13:56:07,905 INFO MainThread:9079 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 13:56:07,909 INFO MainThread:9079 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 13:56:07,911 INFO MainThread:9079 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 13:56:07,916 INFO MainThread:9079 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 13:56:08,064 INFO MainThread:9079 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 13:56:08,532 INFO MainThread:9079 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 13:56:08,612 INFO MainThread:9079 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 13:56:08,689 INFO MainThread:9079 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 13:56:08,690 INFO MainThread:9079 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 13:56:08,691 INFO MainThread:9079 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 13:57:15,195 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 13:57:15,196 INFO MainThread:9079 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 13:57:21,887 WARNING MsgRouterThr:9079 [router.py:message_loop():77] message_loop has been closed
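One consistency check on the config dict logged above, assuming the usual relation between these fields (the log itself does not state the formula): gradient_accumulation_steps should equal global_batch_size divided by micro_batch_size times world_size. For this run that is 320 / (8 * 1) = 40, which matches the logged gradient_accumulation_steps.

# Assumed relation, not quoted from the training code:
# gradient_accumulation_steps = global_batch_size / (micro_batch_size * world_size)
global_batch_size = 320  # from the config dict logged above
micro_batch_size = 8
world_size = 1

grad_accum = global_batch_size // (micro_batch_size * world_size)
print(grad_accum)  # 40, matching the logged value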
wandb/run-20240804_135607-ikp7tdz1/run-ikp7tdz1.wandb ADDED
Binary file (22.5 kB). View file
 
wandb/run-20240812_070449-ufge4h1y/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-07:04:37
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 5
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723413889.11596
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
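Each key in the config.yaml above is wrapped in a small mapping with desc and value fields (wandb's export format). A minimal sketch for flattening such a file back into a plain dict, assuming PyYAML is installed and the path below matches this upload:

import yaml

# Assumed path, matching the layout of this upload.
cfg_path = "wandb/run-20240812_070449-ufge4h1y/files/config.yaml"

with open(cfg_path) as f:
    raw = yaml.safe_load(f)

# Keep only the {'desc': ..., 'value': ...} entries and unwrap them,
# skipping wandb bookkeeping such as wandb_version and _wandb.
config = {
    key: entry["value"]
    for key, entry in raw.items()
    if isinstance(entry, dict) and "value" in entry and key != "_wandb"
}

print(config["global_batch_size"], config["micro_batch_size"])  # 320 1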
wandb/run-20240812_070449-ufge4h1y/files/output.log ADDED
@@ -0,0 +1,158 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
10
+ warnings.warn(
11
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
14
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
15
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
16
+ BFloat16 enabled for mixed precision - using bfSixteen policy
17
+ Let split = None
18
+ Building a BlendedDataset for a single MegatronDataset
19
+ Unable to save the indexes because path_to_cache is None
20
+ Building a BlendedDataset for a single MegatronDataset
21
+ Unable to save the indexes because path_to_cache is None
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ --> applying fsdp activation checkpointing...
25
+ > datasets target sizes (minimum size):
26
+ train: 6400000
27
+ validation: 323200
28
+ test: 3200
29
+ > building train, validation, and test datasets for GPT ...
30
+ > finished creating GPT datasets ...
31
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
32
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
33
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
34
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
35
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
36
+ model info: FullyShardedDataParallel(
37
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
38
+ (model): Qwen2Model(
39
+ (embed_tokens): Embedding(151936, 896)
40
+ (layers): ModuleList(
41
+ (0-23): 24 x FullyShardedDataParallel(
42
+ (_fsdp_wrapped_module): CheckpointWrapper(
43
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
44
+ (self_attn): Qwen2FlashAttention2(
45
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
46
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
47
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
48
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
49
+ (rotary_emb): Qwen2RotaryEmbedding()
50
+ )
51
+ (mlp): Qwen2MLP(
52
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
53
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
54
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
55
+ (act_fn): SiLU()
56
+ )
57
+ (input_layernorm): Qwen2RMSNorm()
58
+ (post_attention_layernorm): Qwen2RMSNorm()
59
+ )
60
+ )
61
+ )
62
+ )
63
+ (norm): Qwen2RMSNorm()
64
+ )
65
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
66
+ )
67
+ )
68
+ model config: Qwen2Config {
69
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
70
+ "architectures": [
71
+ "Qwen2ForCausalLM"
72
+ ],
73
+ "attention_dropout": 0.0,
74
+ "bos_token_id": 151643,
75
+ "eos_token_id": 151643,
76
+ "hidden_act": "silu",
77
+ "hidden_size": 896,
78
+ "initializer_range": 0.02,
79
+ "intermediate_size": 4864,
80
+ "label_smoothing": 0.0,
81
+ "max_position_embeddings": 4096,
82
+ "max_window_layers": 24,
83
+ "model_type": "qwen2",
84
+ "num_attention_heads": 14,
85
+ "num_hidden_layers": 24,
86
+ "num_key_value_heads": 2,
87
+ "rms_norm_eps": 1e-06,
88
+ "rope_theta": 1000000.0,
89
+ "sliding_window": null,
90
+ "tie_word_embeddings": true,
91
+ "torch_dtype": "bfloat16",
92
+ "transformers_version": "4.43.3",
93
+ "use_cache": false,
94
+ "use_sliding_window": false,
95
+ "vocab_size": 151936
96
+ }
97
+ ------------------------------------------------------------------
98
+ iteration: 1 , TFLOPS: 69.43623917184445, Tokens per sec: 17268.44384112612, Loss: 4.1814446449279785
99
+ ------------------------------------------------------------------
100
+ ------------------------------------------------------------------
101
+ iteration: 2 , TFLOPS: 69.64205785663373, Tokens per sec: 17319.629914020166, Loss: 4.191491603851318
102
+ ------------------------------------------------------------------
103
+ ------------------------------------------------------------------
104
+ iteration: 3 , TFLOPS: 69.60094665048808, Tokens per sec: 17309.405763590446, Loss: 4.197597026824951
105
+ ------------------------------------------------------------------
106
+ ------------------------------------------------------------------
107
+ iteration: 4 , TFLOPS: 69.47512522949748, Tokens per sec: 17278.114608304662, Loss: 4.183670520782471
108
+ ------------------------------------------------------------------
109
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
110
+ warnings.warn(
111
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
112
+ warnings.warn(
113
+ ------------------------------------------------------------------
114
+ iteration: 5 , TFLOPS: 69.67467547447801, Tokens per sec: 17327.7417517103, Loss: 4.198245048522949
115
+ ------------------------------------------------------------------
116
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
117
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
118
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
119
+ [rank0]:[2024-08-12 07:11:16,345] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006517466999866883, 'preprocessing_with_comm': 0.0007555539996246807, 'state_converting': 0.9849483990001318, <Type.ALL: 'all'>: 0.9936859660001574})
120
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
121
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
122
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
123
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
124
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
125
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
126
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005, took 4.44s
127
+ ------------------------------------------------------------------
128
+ iteration: 6 , TFLOPS: 70.22008480550622, Tokens per sec: 17463.382312253587, Loss: 4.179391860961914
129
+ ------------------------------------------------------------------
130
+ ------------------------------------------------------------------
131
+ iteration: 7 , TFLOPS: 69.98955682269778, Tokens per sec: 17406.051161079293, Loss: 4.190949440002441
132
+ ------------------------------------------------------------------
133
+ ------------------------------------------------------------------
134
+ iteration: 8 , TFLOPS: 69.94509258955091, Tokens per sec: 17394.993129679646, Loss: 4.189082622528076
135
+ ------------------------------------------------------------------
136
+ ------------------------------------------------------------------
137
+ iteration: 9 , TFLOPS: 70.07602036768274, Tokens per sec: 17427.55421033261, Loss: 4.181089878082275
138
+ ------------------------------------------------------------------
139
+ ------------------------------------------------------------------
140
+ iteration: 10 , TFLOPS: 70.03395601975187, Tokens per sec: 17417.093018329397, Loss: 4.1603803634643555
141
+ ------------------------------------------------------------------
142
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010
143
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
144
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/model.pt
145
+ [rank0]:[2024-08-12 07:17:37,283] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0064329239994549425, 'preprocessing_with_comm': 0.0007190309997895383, 'state_converting': 0.9757228209991808, <Type.ALL: 'all'>: 0.9842789310005173})
146
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
147
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/optimizer.pt
148
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
149
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/scheduler.pt
150
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
151
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010/rng.pt
152
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000010, took 4.48s
153
+ ------------------------------------------------------------------
154
+ iteration: 11 , TFLOPS: 70.31766010694388, Tokens per sec: 17487.64879951231, Loss: 4.118324279785156
155
+ ------------------------------------------------------------------
156
+ ------------------------------------------------------------------
157
+ iteration: 12 , TFLOPS: 70.37958976318761, Tokens per sec: 17503.050393891557, Loss: 4.171144008636475
158
+ ------------------------------------------------------------------
wandb/run-20240812_070449-ufge4h1y/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T22:04:49.754332",
5
+ "startedAt": "2024-08-11T22:04:49.102690",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "5",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-07:04:37"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 4.171144008636475, "training/perplexity": 64.78952950804121, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 12, "optimizer/lr": 1.4560000000000001e-06, "optimizer/variance_l2": 0.012989128226478895, "optimizer/variance_sqrt_l2": 0.6784465027663834, "optimizer/momentum_l2": 0.7107880089338467, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.4604034423828125, "optimizer/variance_sqrt_l1": 2849.0, "optimizer/momentum_l1": 2785.25, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00909423828125, "optimizer/variance_sqrt_abs_max": 0.09521484375, "optimizer/momentum_abs_max": 0.10107421875, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.9035151299995, "stats/tokens_per_sec": 17503.050393891557, "stats/tokens_per_sec_per_gpu": 17503.050393891557, "stats/tflops": 70.37958976318761, "_timestamp": 1723414808.909133, "_runtime": 919.7931730747223, "_step": 12, "_wandb": {"runtime": 922}}
wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log ADDED
@@ -0,0 +1,616 @@
1
+ 2024-08-12 07:04:49,117 INFO StreamThr :13762 [internal.py:wandb_internal():86] W&B internal server running at pid: 13762, started at: 2024-08-12 07:04:49.116639
2
+ 2024-08-12 07:04:49,119 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 07:04:49,121 INFO WriterThread:13762 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
4
+ 2024-08-12 07:04:49,122 DEBUG SenderThread:13762 [sender.py:send():382] send: header
5
+ 2024-08-12 07:04:49,136 DEBUG SenderThread:13762 [sender.py:send():382] send: run
6
+ 2024-08-12 07:04:49,638 INFO SenderThread:13762 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_070449-ufge4h1y/files
7
+ 2024-08-12 07:04:49,638 INFO SenderThread:13762 [sender.py:_start_run_threads():1136] run started: ufge4h1y with start time 1723413889.11596
8
+ 2024-08-12 07:04:49,643 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 07:04:49,643 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 07:04:49,733 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 07:04:49,739 DEBUG HandlerThread:13762 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 07:04:49,740 INFO HandlerThread:13762 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 07:04:49,740 INFO SystemMonitor:13762 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 07:04:49,741 INFO SystemMonitor:13762 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 07:04:49,742 INFO SystemMonitor:13762 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 07:04:49,744 INFO SystemMonitor:13762 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 07:04:49,754 DEBUG HandlerThread:13762 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 07:04:49,756 DEBUG HandlerThread:13762 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 07:04:49,770 DEBUG HandlerThread:13762 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 07:04:49,771 DEBUG HandlerThread:13762 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:04:49.754332', 'startedAt': '2024-08-11T22:04:49.102690', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:04:37'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 07:04:49,771 INFO HandlerThread:13762 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 07:04:49,772 INFO HandlerThread:13762 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 07:04:49,779 DEBUG SenderThread:13762 [sender.py:send():382] send: files
30
+ 2024-08-12 07:04:49,779 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 07:04:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 07:04:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-12 07:04:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-12 07:04:49,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 07:04:50,088 DEBUG SenderThread:13762 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 07:04:50,465 INFO wandb-upload_0:13762 [upload_job.py:push():131] Uploaded file /tmp/tmp0h3j51sdwandb/z7nk28zc-wandb-metadata.json
38
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json
39
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
40
+ 2024-08-12 07:04:50,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt
41
+ 2024-08-12 07:04:52,640 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
42
+ 2024-08-12 07:04:54,468 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 07:04:54,641 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
44
+ 2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
45
+ 2024-08-12 07:04:54,719 DEBUG SenderThread:13762 [sender.py:send():382] send: config
46
+ 2024-08-12 07:04:56,643 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
47
+ 2024-08-12 07:04:59,720 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
48
+ 2024-08-12 07:05:04,721 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-12 07:05:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
50
+ 2024-08-12 07:05:04,790 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
51
+ 2024-08-12 07:05:04,790 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-12 07:05:10,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-12 07:05:15,015 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-12 07:05:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
55
+ 2024-08-12 07:05:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
56
+ 2024-08-12 07:05:19,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
57
+ 2024-08-12 07:05:20,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 07:05:20,658 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml
59
+ 2024-08-12 07:05:25,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
60
+ 2024-08-12 07:05:30,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-12 07:05:34,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
62
+ 2024-08-12 07:05:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
63
+ 2024-08-12 07:05:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
64
+ 2024-08-12 07:05:36,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
65
+ 2024-08-12 07:05:41,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-12 07:05:46,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-12 07:05:49,744 DEBUG SystemMonitor:13762 [system_monitor.py:_start():172] Starting system metrics aggregation loop
68
+ 2024-08-12 07:05:49,746 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
69
+ 2024-08-12 07:05:49,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
70
+ 2024-08-12 07:05:49,788 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
71
+ 2024-08-12 07:05:49,828 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
72
+ 2024-08-12 07:05:51,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-12 07:05:56,987 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-12 07:06:01,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-12 07:06:04,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
76
+ 2024-08-12 07:06:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
77
+ 2024-08-12 07:06:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
78
+ 2024-08-12 07:06:06,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
79
+ 2024-08-12 07:06:10,837 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
80
+ 2024-08-12 07:06:12,691 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
81
+ 2024-08-12 07:06:12,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 07:06:17,882 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-12 07:06:19,747 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
84
+ 2024-08-12 07:06:19,788 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-12 07:06:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
86
+ 2024-08-12 07:06:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
87
+ 2024-08-12 07:06:23,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-12 07:06:28,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 07:06:33,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 07:06:34,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-12 07:06:34,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-12 07:06:34,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-12 07:06:39,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 07:06:44,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 07:06:49,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 07:06:49,748 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
97
+ 2024-08-12 07:06:49,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 07:06:49,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 07:06:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 07:06:54,988 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 07:06:59,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 07:07:04,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
103
+ 2024-08-12 07:07:04,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
104
+ 2024-08-12 07:07:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
105
+ 2024-08-12 07:07:05,036 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
106
+ 2024-08-12 07:07:10,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-12 07:07:15,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 07:07:19,749 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
109
+ 2024-08-12 07:07:19,789 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
110
+ 2024-08-12 07:07:19,789 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
111
+ 2024-08-12 07:07:19,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
112
+ 2024-08-12 07:07:20,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-12 07:07:25,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-12 07:07:26,535 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
115
+ 2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send():382] send: history
116
+ 2024-08-12 07:07:26,538 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
117
+ 2024-08-12 07:07:26,540 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
118
+ 2024-08-12 07:07:26,739 INFO Thread-12 :13762 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
119
+ 2024-08-12 07:07:28,741 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
120
+ 2024-08-12 07:07:31,578 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
122
+ 2024-08-12 07:07:34,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
123
+ 2024-08-12 07:07:34,791 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
124
+ 2024-08-12 07:07:37,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-12 07:07:42,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-12 07:07:47,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
127
+ 2024-08-12 07:07:49,750 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
128
+ 2024-08-12 07:07:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
129
+ 2024-08-12 07:07:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
130
+ 2024-08-12 07:07:49,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
131
+ 2024-08-12 07:07:52,985 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-08-12 07:07:57,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
133
+ 2024-08-12 07:08:02,986 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
134
+ 2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
135
+ 2024-08-12 07:08:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
136
+ 2024-08-12 07:08:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
137
+ 2024-08-12 07:08:08,037 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
138
+ 2024-08-12 07:08:13,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
139
+ 2024-08-12 07:08:18,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-08-12 07:08:19,751 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
141
+ 2024-08-12 07:08:19,791 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
142
+ 2024-08-12 07:08:19,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
143
+ 2024-08-12 07:08:19,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
144
+ 2024-08-12 07:08:23,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
145
+ 2024-08-12 07:08:28,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
146
+ 2024-08-12 07:08:33,991 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-12 07:08:34,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
148
+ 2024-08-12 07:08:34,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
149
+ 2024-08-12 07:08:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
150
+ 2024-08-12 07:08:39,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
151
+ 2024-08-12 07:08:42,279 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
152
+ 2024-08-12 07:08:42,281 DEBUG SenderThread:13762 [sender.py:send():382] send: history
153
+ 2024-08-12 07:08:42,282 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
154
+ 2024-08-12 07:08:42,283 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
155
+ 2024-08-12 07:08:42,792 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
156
+ 2024-08-12 07:08:44,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
157
+ 2024-08-12 07:08:44,793 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
158
+ 2024-08-12 07:08:49,322 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
159
+ 2024-08-12 07:08:49,752 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
160
+ 2024-08-12 07:08:49,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
161
+ 2024-08-12 07:08:49,792 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
162
+ 2024-08-12 07:08:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
163
+ 2024-08-12 07:08:54,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
164
+ 2024-08-12 07:08:59,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-12 07:09:04,792 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
166
+ 2024-08-12 07:09:04,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
167
+ 2024-08-12 07:09:04,832 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
168
+ 2024-08-12 07:09:05,001 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
169
+ 2024-08-12 07:09:10,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
170
+ 2024-08-12 07:09:15,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-12 07:09:19,753 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
172
+ 2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
173
+ 2024-08-12 07:09:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
174
+ 2024-08-12 07:09:19,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
175
+ 2024-08-12 07:09:20,044 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
176
+ 2024-08-12 07:09:25,045 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-08-12 07:09:30,046 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
179
+ 2024-08-12 07:09:34,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
180
+ 2024-08-12 07:09:34,793 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
181
+ 2024-08-12 07:09:35,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
182
+ 2024-08-12 07:09:40,995 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
183
+ 2024-08-12 07:09:45,996 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-08-12 07:09:49,754 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
185
+ 2024-08-12 07:09:49,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
186
+ 2024-08-12 07:09:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
187
+ 2024-08-12 07:09:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
188
+ 2024-08-12 07:09:51,979 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
189
+ 2024-08-12 07:09:56,980 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
190
+ 2024-08-12 07:09:58,160 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
191
+ 2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send():382] send: history
192
+ 2024-08-12 07:09:58,162 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
193
+ 2024-08-12 07:09:58,163 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
194
+ 2024-08-12 07:09:58,845 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
195
+ 2024-08-12 07:10:00,846 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
196
+ 2024-08-12 07:10:02,202 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
197
+ 2024-08-12 07:10:04,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
198
+ 2024-08-12 07:10:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
199
+ 2024-08-12 07:10:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
200
+ 2024-08-12 07:10:08,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
201
+ 2024-08-12 07:10:13,062 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
202
+ 2024-08-12 07:10:18,063 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
203
+ 2024-08-12 07:10:19,755 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
204
+ 2024-08-12 07:10:19,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
205
+ 2024-08-12 07:10:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
206
+ 2024-08-12 07:10:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
207
+ 2024-08-12 07:10:23,070 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
208
+ 2024-08-12 07:10:28,071 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-12 07:10:33,072 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-12 07:10:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
211
+ 2024-08-12 07:10:34,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
212
+ 2024-08-12 07:10:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
213
+ 2024-08-12 07:10:38,976 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
214
+ 2024-08-12 07:10:43,977 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-12 07:10:48,978 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
216
+ 2024-08-12 07:10:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
217
+ 2024-08-12 07:10:49,793 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
218
+ 2024-08-12 07:10:49,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
219
+ 2024-08-12 07:10:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
220
+ 2024-08-12 07:10:54,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
221
+ 2024-08-12 07:10:59,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
222
+ 2024-08-12 07:11:04,012 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-12 07:11:04,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
224
+ 2024-08-12 07:11:04,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
225
+ 2024-08-12 07:11:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
226
+ 2024-08-12 07:11:09,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-12 07:11:13,824 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
228
+ 2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send():382] send: history
229
+ 2024-08-12 07:11:13,826 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
230
+ 2024-08-12 07:11:13,827 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
231
+ 2024-08-12 07:11:13,896 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
232
+ 2024-08-12 07:11:14,866 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
233
+ 2024-08-12 07:11:14,897 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
234
+ 2024-08-12 07:11:16,898 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
235
+ 2024-08-12 07:11:18,900 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
236
+ 2024-08-12 07:11:19,757 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
237
+ 2024-08-12 07:11:19,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
238
+ 2024-08-12 07:11:19,794 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
239
+ 2024-08-12 07:11:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
240
+ 2024-08-12 07:11:20,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-08-12 07:11:20,901 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
242
+ 2024-08-12 07:11:25,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
243
+ 2024-08-12 07:11:30,005 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
244
+ 2024-08-12 07:11:34,794 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
245
+ 2024-08-12 07:11:34,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
246
+ 2024-08-12 07:11:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
247
+ 2024-08-12 07:11:35,993 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
248
+ 2024-08-12 07:11:40,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
249
+ 2024-08-12 07:11:45,994 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
250
+ 2024-08-12 07:11:49,758 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
251
+ 2024-08-12 07:11:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
252
+ 2024-08-12 07:11:49,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
253
+ 2024-08-12 07:11:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
254
+ 2024-08-12 07:11:51,989 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
255
+ 2024-08-12 07:11:56,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
256
+ 2024-08-12 07:12:01,990 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
257
+ 2024-08-12 07:12:04,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
258
+ 2024-08-12 07:12:04,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
259
+ 2024-08-12 07:12:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
260
+ 2024-08-12 07:12:06,998 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
261
+ 2024-08-12 07:12:11,999 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
262
+ 2024-08-12 07:12:17,000 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
263
+ 2024-08-12 07:12:19,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
264
+ 2024-08-12 07:12:19,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
265
+ 2024-08-12 07:12:19,795 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
266
+ 2024-08-12 07:12:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
267
+ 2024-08-12 07:12:22,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
268
+ 2024-08-12 07:12:27,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
269
+ 2024-08-12 07:12:32,011 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
270
+ 2024-08-12 07:12:33,344 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
271
+ 2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send():382] send: history
272
+ 2024-08-12 07:12:33,346 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
273
+ 2024-08-12 07:12:33,348 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
274
+ 2024-08-12 07:12:33,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
275
+ 2024-08-12 07:12:34,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
276
+ 2024-08-12 07:12:34,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
277
+ 2024-08-12 07:12:34,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
278
+ 2024-08-12 07:12:34,948 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
279
+ 2024-08-12 07:12:38,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
280
+ 2024-08-12 07:12:43,002 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
281
+ 2024-08-12 07:12:48,003 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
282
+ 2024-08-12 07:12:49,760 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
283
+ 2024-08-12 07:12:49,795 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
284
+ 2024-08-12 07:12:49,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
285
+ 2024-08-12 07:12:49,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
286
+ 2024-08-12 07:12:53,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
287
+ 2024-08-12 07:12:58,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
288
+ 2024-08-12 07:13:03,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
289
+ 2024-08-12 07:13:04,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
290
+ 2024-08-12 07:13:04,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
291
+ 2024-08-12 07:13:04,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
292
+ 2024-08-12 07:13:09,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
293
+ 2024-08-12 07:13:14,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
294
+ 2024-08-12 07:13:19,034 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
295
+ 2024-08-12 07:13:19,761 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
296
+ 2024-08-12 07:13:19,796 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
297
+ 2024-08-12 07:13:19,796 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
298
+ 2024-08-12 07:13:19,836 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
299
+ 2024-08-12 07:13:25,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
300
+ 2024-08-12 07:13:30,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
301
+ 2024-08-12 07:13:34,838 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
302
+ 2024-08-12 07:13:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
303
+ 2024-08-12 07:13:34,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
304
+ 2024-08-12 07:13:35,055 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
305
+ 2024-08-12 07:13:40,056 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
306
+ 2024-08-12 07:13:45,057 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
307
+ 2024-08-12 07:13:48,668 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
308
+ 2024-08-12 07:13:48,670 DEBUG SenderThread:13762 [sender.py:send():382] send: history
309
+ 2024-08-12 07:13:48,671 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
310
+ 2024-08-12 07:13:48,672 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
311
+ 2024-08-12 07:13:48,997 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
312
+ 2024-08-12 07:13:49,762 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
313
+ 2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
314
+ 2024-08-12 07:13:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
315
+ 2024-08-12 07:13:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
316
+ 2024-08-12 07:13:50,998 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
317
+ 2024-08-12 07:13:51,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
318
+ 2024-08-12 07:13:56,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
319
+ 2024-08-12 07:14:01,023 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
320
+ 2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
321
+ 2024-08-12 07:14:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
322
+ 2024-08-12 07:14:04,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
323
+ 2024-08-12 07:14:06,089 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
324
+ 2024-08-12 07:14:11,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
325
+ 2024-08-12 07:14:16,090 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
326
+ 2024-08-12 07:14:19,763 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
327
+ 2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
328
+ 2024-08-12 07:14:19,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
329
+ 2024-08-12 07:14:19,839 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
330
+ 2024-08-12 07:14:21,108 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
331
+ 2024-08-12 07:14:26,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
332
+ 2024-08-12 07:14:31,109 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
333
+ 2024-08-12 07:14:34,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
334
+ 2024-08-12 07:14:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
335
+ 2024-08-12 07:14:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
336
+ 2024-08-12 07:14:37,031 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
337
+ 2024-08-12 07:14:42,032 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
338
+ 2024-08-12 07:14:47,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
339
+ 2024-08-12 07:14:49,764 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
340
+ 2024-08-12 07:14:49,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
341
+ 2024-08-12 07:14:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
342
+ 2024-08-12 07:14:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
343
+ 2024-08-12 07:14:52,060 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
344
+ 2024-08-12 07:14:57,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
345
+ 2024-08-12 07:15:02,061 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
346
+ 2024-08-12 07:15:04,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
347
+ 2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send():382] send: history
348
+ 2024-08-12 07:15:04,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
349
+ 2024-08-12 07:15:04,043 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
350
+ 2024-08-12 07:15:04,047 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
351
+ 2024-08-12 07:15:04,839 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
352
+ 2024-08-12 07:15:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
353
+ 2024-08-12 07:15:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
354
+ 2024-08-12 07:15:05,048 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
355
+ 2024-08-12 07:15:07,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
356
+ 2024-08-12 07:15:12,077 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
357
+ 2024-08-12 07:15:17,078 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
358
+ 2024-08-12 07:15:19,765 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
359
+ 2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
360
+ 2024-08-12 07:15:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
361
+ 2024-08-12 07:15:19,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
362
+ 2024-08-12 07:15:22,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
363
+ 2024-08-12 07:15:27,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
364
+ 2024-08-12 07:15:32,082 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
365
+ 2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
366
+ 2024-08-12 07:15:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
367
+ 2024-08-12 07:15:34,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
368
+ 2024-08-12 07:15:38,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
369
+ 2024-08-12 07:15:43,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
370
+ 2024-08-12 07:15:48,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
371
+ 2024-08-12 07:15:49,766 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
372
+ 2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
373
+ 2024-08-12 07:15:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
374
+ 2024-08-12 07:15:49,840 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
375
+ 2024-08-12 07:15:53,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
376
+ 2024-08-12 07:15:58,080 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
377
+ 2024-08-12 07:16:03,081 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
378
+ 2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
379
+ 2024-08-12 07:16:04,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
380
+ 2024-08-12 07:16:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
381
+ 2024-08-12 07:16:09,051 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
382
+ 2024-08-12 07:16:14,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
383
+ 2024-08-12 07:16:19,053 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
384
+ 2024-08-12 07:16:19,269 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
385
+ 2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send():382] send: history
386
+ 2024-08-12 07:16:19,271 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
387
+ 2024-08-12 07:16:19,273 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
388
+ 2024-08-12 07:16:19,767 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
389
+ 2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
390
+ 2024-08-12 07:16:19,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
391
+ 2024-08-12 07:16:19,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
392
+ 2024-08-12 07:16:20,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
393
+ 2024-08-12 07:16:21,099 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
394
+ 2024-08-12 07:16:25,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
395
+ 2024-08-12 07:16:30,052 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
396
+ 2024-08-12 07:16:34,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
397
+ 2024-08-12 07:16:34,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
398
+ 2024-08-12 07:16:34,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
399
+ 2024-08-12 07:16:35,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
400
+ 2024-08-12 07:16:40,100 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
401
+ 2024-08-12 07:16:45,101 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
402
+ 2024-08-12 07:16:49,768 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
403
+ 2024-08-12 07:16:49,840 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
404
+ 2024-08-12 07:16:49,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
405
+ 2024-08-12 07:16:49,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
406
+ 2024-08-12 07:16:51,038 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
407
+ 2024-08-12 07:16:56,039 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
408
+ 2024-08-12 07:17:01,040 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
409
+ 2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
410
+ 2024-08-12 07:17:04,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
411
+ 2024-08-12 07:17:04,841 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
412
+ 2024-08-12 07:17:06,122 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
413
+ 2024-08-12 07:17:11,123 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
414
+ 2024-08-12 07:17:16,124 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
415
+ 2024-08-12 07:17:19,769 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
416
+ 2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
417
+ 2024-08-12 07:17:19,841 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
418
+ 2024-08-12 07:17:19,842 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
419
+ 2024-08-12 07:17:22,020 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
420
+ 2024-08-12 07:17:27,021 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
421
+ 2024-08-12 07:17:32,022 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
422
+ 2024-08-12 07:17:34,545 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
423
+ 2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send():382] send: history
424
+ 2024-08-12 07:17:34,548 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
425
+ 2024-08-12 07:17:34,550 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
426
+ 2024-08-12 07:17:35,013 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
427
+ 2024-08-12 07:17:35,041 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
428
+ 2024-08-12 07:17:35,041 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
429
+ 2024-08-12 07:17:35,149 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
430
+ 2024-08-12 07:17:37,151 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
431
+ 2024-08-12 07:17:37,272 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
432
+ 2024-08-12 07:17:39,152 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
433
+ 2024-08-12 07:17:41,154 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
434
+ 2024-08-12 07:17:43,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
435
+ 2024-08-12 07:17:48,033 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
436
+ 2024-08-12 07:17:49,770 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
437
+ 2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
438
+ 2024-08-12 07:17:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
439
+ 2024-08-12 07:17:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
440
+ 2024-08-12 07:17:53,197 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
441
+ 2024-08-12 07:17:58,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
442
+ 2024-08-12 07:18:03,198 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
443
+ 2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
444
+ 2024-08-12 07:18:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
445
+ 2024-08-12 07:18:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
446
+ 2024-08-12 07:18:08,232 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
447
+ 2024-08-12 07:18:13,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
448
+ 2024-08-12 07:18:18,233 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
449
+ 2024-08-12 07:18:19,771 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
450
+ 2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
451
+ 2024-08-12 07:18:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
452
+ 2024-08-12 07:18:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
453
+ 2024-08-12 07:18:23,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
454
+ 2024-08-12 07:18:28,237 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
455
+ 2024-08-12 07:18:33,238 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
456
+ 2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
457
+ 2024-08-12 07:18:34,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
458
+ 2024-08-12 07:18:34,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
459
+ 2024-08-12 07:18:39,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
460
+ 2024-08-12 07:18:44,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
461
+ 2024-08-12 07:18:49,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
462
+ 2024-08-12 07:18:49,772 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
463
+ 2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
464
+ 2024-08-12 07:18:49,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
465
+ 2024-08-12 07:18:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
466
+ 2024-08-12 07:18:54,004 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
467
+ 2024-08-12 07:18:54,006 DEBUG SenderThread:13762 [sender.py:send():382] send: history
468
+ 2024-08-12 07:18:54,007 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
469
+ 2024-08-12 07:18:54,008 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
470
+ 2024-08-12 07:18:54,198 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
471
+ 2024-08-12 07:18:55,009 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
472
+ 2024-08-12 07:18:55,199 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
473
+ 2024-08-12 07:19:00,010 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
474
+ 2024-08-12 07:19:04,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
475
+ 2024-08-12 07:19:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
476
+ 2024-08-12 07:19:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
477
+ 2024-08-12 07:19:05,244 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
478
+ 2024-08-12 07:19:10,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
479
+ 2024-08-12 07:19:15,245 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
480
+ 2024-08-12 07:19:19,773 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
481
+ 2024-08-12 07:19:19,970 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
482
+ 2024-08-12 07:19:19,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
483
+ 2024-08-12 07:19:19,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
484
+ 2024-08-12 07:19:21,167 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
485
+ 2024-08-12 07:19:26,168 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
486
+ 2024-08-12 07:19:31,169 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
487
+ 2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
488
+ 2024-08-12 07:19:34,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
489
+ 2024-08-12 07:19:34,972 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
490
+ 2024-08-12 07:19:37,149 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
491
+ 2024-08-12 07:19:42,150 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
492
+ 2024-08-12 07:19:47,151 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
493
+ 2024-08-12 07:19:49,774 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
494
+ 2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
495
+ 2024-08-12 07:19:49,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
496
+ 2024-08-12 07:19:49,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
497
+ 2024-08-12 07:19:52,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
498
+ 2024-08-12 07:19:57,230 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
499
+ 2024-08-12 07:20:02,231 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
500
+ 2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: stop_status
501
+ 2024-08-12 07:20:04,971 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: internal_messages
502
+ 2024-08-12 07:20:04,971 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: stop_status
503
+ 2024-08-12 07:20:08,210 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
504
+ 2024-08-12 07:20:08,910 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: partial_history
505
+ 2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send():382] send: history
506
+ 2024-08-12 07:20:08,913 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
507
+ 2024-08-12 07:20:08,914 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
508
+ 2024-08-12 07:20:09,243 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
509
+ 2024-08-12 07:20:09,244 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
510
+ 2024-08-12 07:20:12,332 DEBUG SenderThread:13762 [sender.py:send():382] send: exit
511
+ 2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():589] handling exit code: 255
512
+ 2024-08-12 07:20:12,332 INFO SenderThread:13762 [sender.py:send_exit():591] handling runtime: 922
513
+ 2024-08-12 07:20:12,333 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
514
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_exit():597] send defer
515
+ 2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
516
+ 2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 0
517
+ 2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
518
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 0
519
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 1
520
+ 2024-08-12 07:20:12,334 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
521
+ 2024-08-12 07:20:12,334 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 1
522
+ 2024-08-12 07:20:12,334 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
523
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 1
524
+ 2024-08-12 07:20:12,334 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 2
525
+ 2024-08-12 07:20:12,335 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
526
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 2
527
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [system_monitor.py:finish():203] Stopping system monitor
528
+ 2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():179] Finished system metrics aggregation loop
529
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined cpu monitor
530
+ 2024-08-12 07:20:12,335 DEBUG SystemMonitor:13762 [system_monitor.py:_start():183] Publishing last batch of metrics
531
+ 2024-08-12 07:20:12,335 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined disk monitor
532
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined gpu monitor
533
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined memory monitor
534
+ 2024-08-12 07:20:12,371 INFO HandlerThread:13762 [interfaces.py:finish():202] Joined network monitor
535
+ 2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
536
+ 2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 2
537
+ 2024-08-12 07:20:12,372 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 3
538
+ 2024-08-12 07:20:12,372 DEBUG SenderThread:13762 [sender.py:send():382] send: stats
539
+ 2024-08-12 07:20:12,372 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
540
+ 2024-08-12 07:20:12,373 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 3
541
+ 2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send():382] send: history
542
+ 2024-08-12 07:20:12,374 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: summary_record
543
+ 2024-08-12 07:20:12,375 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
544
+ 2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
545
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 3
546
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 4
547
+ 2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
548
+ 2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 4
549
+ 2024-08-12 07:20:12,376 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
550
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 4
551
+ 2024-08-12 07:20:12,376 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 5
552
+ 2024-08-12 07:20:12,376 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
553
+ 2024-08-12 07:20:12,376 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 5
554
+ 2024-08-12 07:20:12,377 DEBUG SenderThread:13762 [sender.py:send():382] send: summary
555
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
556
+ 2024-08-12 07:20:12,378 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
557
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 5
558
+ 2024-08-12 07:20:12,378 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 6
559
+ 2024-08-12 07:20:12,378 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
560
+ 2024-08-12 07:20:12,378 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 6
561
+ 2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
562
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 6
563
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 7
564
+ 2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
565
+ 2024-08-12 07:20:12,379 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
566
+ 2024-08-12 07:20:12,379 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 7
567
+ 2024-08-12 07:20:12,379 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
568
+ 2024-08-12 07:20:12,379 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 7
569
+ 2024-08-12 07:20:13,247 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json
570
+ 2024-08-12 07:20:13,332 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
571
+ 2024-08-12 07:20:15,017 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 8
572
+ 2024-08-12 07:20:15,017 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
573
+ 2024-08-12 07:20:15,017 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
574
+ 2024-08-12 07:20:15,017 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 8
575
+ 2024-08-12 07:20:15,018 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
576
+ 2024-08-12 07:20:15,018 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 8
577
+ 2024-08-12 07:20:15,018 INFO SenderThread:13762 [job_builder.py:build():296] Attempting to build job artifact
578
+ 2024-08-12 07:20:15,019 INFO SenderThread:13762 [job_builder.py:_get_source_type():426] is repo sourced job
579
+ 2024-08-12 07:20:15,033 INFO SenderThread:13762 [job_builder.py:build():402] adding wandb-job metadata file
580
+ 2024-08-12 07:20:15,042 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 9
581
+ 2024-08-12 07:20:15,042 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
582
+ 2024-08-12 07:20:15,042 DEBUG SenderThread:13762 [sender.py:send():382] send: artifact
583
+ 2024-08-12 07:20:15,042 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 9
584
+ 2024-08-12 07:20:15,248 INFO Thread-12 :13762 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log
585
+ 2024-08-12 07:20:15,333 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: poll_exit
586
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
587
+ 2024-08-12 07:20:15,953 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
588
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 9
589
+ 2024-08-12 07:20:15,953 INFO SenderThread:13762 [dir_watcher.py:finish():358] shutting down directory watcher
590
+ 2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_070449-ufge4h1y/files
591
+ 2024-08-12 07:20:16,249 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/requirements.txt requirements.txt
592
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/config.yaml config.yaml
593
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-metadata.json wandb-metadata.json
594
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/wandb-summary.json wandb-summary.json
595
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_070449-ufge4h1y/files/output.log output.log
596
+ 2024-08-12 07:20:16,250 INFO SenderThread:13762 [sender.py:transition_state():617] send defer: 10
597
+ 2024-08-12 07:20:16,250 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: poll_exit
598
+ 2024-08-12 07:20:16,251 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: defer
599
+ 2024-08-12 07:20:16,251 INFO HandlerThread:13762 [handler.py:handle_request_defer():172] handle defer: 10
600
+ 2024-08-12 07:20:16,251 DEBUG SenderThread:13762 [sender.py:send_request():409] send_request: defer
601
+ 2024-08-12 07:20:16,251 INFO SenderThread:13762 [sender.py:send_request_defer():613] handle sender defer: 10
602
+ 2024-08-12 07:20:16,251 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
603
+ 2024-08-12 07:20:20,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
604
+ 2024-08-12 07:20:25,252 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
605
+ 2024-08-12 07:20:30,253 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
606
+ 2024-08-12 07:20:35,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
607
+ 2024-08-12 07:20:40,254 DEBUG HandlerThread:13762 [handler.py:handle_request():146] handle_request: status_report
608
+ 2024-08-12 07:20:43,105 WARNING StreamThr :13762 [internal.py:is_dead():414] Internal process exiting, parent pid 13691 disappeared
609
+ 2024-08-12 07:20:43,105 ERROR StreamThr :13762 [internal.py:wandb_internal():152] Internal process shutdown.
610
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [sender.py:finish():1572] shutting down sender
611
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:finish():172] shutting down file pusher
612
+ 2024-08-12 07:20:43,255 INFO HandlerThread:13762 [handler.py:finish():869] shutting down handler
613
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_pusher.py:join():178] waiting for file pusher
614
+ 2024-08-12 07:20:43,255 INFO WriterThread:13762 [datastore.py:close():296] close: /project/wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb
615
+ 2024-08-12 07:20:43,255 INFO SenderThread:13762 [file_stream.py:finish():595] file stream finish called
616
+ 2024-08-12 07:20:43,425 INFO SenderThread:13762 [file_stream.py:finish():599] file stream finish is done
wandb/run-20240812_070449-ufge4h1y/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-12 07:04:49,108 INFO MainThread:13691 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Configure stats pid to 13691
3
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug.log
9
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_070449-ufge4h1y/logs/debug-internal.log
10
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:04:37', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 07:04:49,109 INFO MainThread:13691 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 07:04:49,114 INFO MainThread:13691 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 07:04:49,115 INFO MainThread:13691 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 07:04:49,120 INFO MainThread:13691 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 07:04:49,131 INFO MainThread:13691 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 07:04:49,642 INFO MainThread:13691 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 07:04:49,725 INFO MainThread:13691 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 07:04:49,788 INFO MainThread:13691 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 07:04:49,789 INFO MainThread:13691 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 07:04:54,718 INFO MainThread:13691 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240812_070449-ufge4h1y/run-ufge4h1y.wandb ADDED
Binary file (81.8 kB).
 
wandb/run-20240812_073202-yby212na/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-07:31:51
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 5
138
+ save_interval:
139
+ desc: null
140
+ value: 1
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723415522.366221
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240812_073202-yby212na/files/output.log ADDED
@@ -0,0 +1,116 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
11
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
12
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
13
+ BFloat16 enabled for mixed precision - using bfSixteen policy
14
+ --> applying fsdp activation checkpointing...
15
+ > datasets target sizes (minimum size):
16
+ train: 6400000
17
+ validation: 12803200
18
+ test: 3200
19
+ > building train, validation, and test datasets for GPT ...
20
+ > finished creating GPT datasets ...
21
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
22
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
23
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
24
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
25
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
26
+ model info: FullyShardedDataParallel(
27
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
28
+ (model): Qwen2Model(
29
+ (embed_tokens): Embedding(151936, 896)
30
+ (layers): ModuleList(
31
+ (0-23): 24 x FullyShardedDataParallel(
32
+ (_fsdp_wrapped_module): CheckpointWrapper(
33
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
34
+ (self_attn): Qwen2FlashAttention2(
35
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
36
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
37
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
39
+ (rotary_emb): Qwen2RotaryEmbedding()
40
+ )
41
+ (mlp): Qwen2MLP(
42
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
43
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
45
+ (act_fn): SiLU()
46
+ )
47
+ (input_layernorm): Qwen2RMSNorm()
48
+ (post_attention_layernorm): Qwen2RMSNorm()
49
+ )
50
+ )
51
+ )
52
+ )
53
+ (norm): Qwen2RMSNorm()
54
+ )
55
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
56
+ )
57
+ )
58
+ model config: Qwen2Config {
59
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
60
+ "architectures": [
61
+ "Qwen2ForCausalLM"
62
+ ],
63
+ "attention_dropout": 0.0,
64
+ "bos_token_id": 151643,
65
+ "eos_token_id": 151643,
66
+ "hidden_act": "silu",
67
+ "hidden_size": 896,
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 4864,
70
+ "label_smoothing": 0.0,
71
+ "max_position_embeddings": 4096,
72
+ "max_window_layers": 24,
73
+ "model_type": "qwen2",
74
+ "num_attention_heads": 14,
75
+ "num_hidden_layers": 24,
76
+ "num_key_value_heads": 2,
77
+ "rms_norm_eps": 1e-06,
78
+ "rope_theta": 1000000.0,
79
+ "sliding_window": null,
80
+ "tie_word_embeddings": true,
81
+ "torch_dtype": "bfloat16",
82
+ "transformers_version": "4.43.3",
83
+ "use_cache": false,
84
+ "use_sliding_window": false,
85
+ "vocab_size": 151936
86
+ }
87
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
88
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
89
+ warnings.warn(
90
+ Let split = None
91
+ Building a BlendedDataset for a single MegatronDataset
92
+ Unable to save the indexes because path_to_cache is None
93
+ Building a BlendedDataset for a single MegatronDataset
94
+ Unable to save the indexes because path_to_cache is None
95
+ Building a BlendedDataset for a single MegatronDataset
96
+ Unable to save the indexes because path_to_cache is None
97
+ ------------------------------------------------------------------
98
+ iteration: 1 , TFLOPS: 69.93553660778689, Tokens per sec: 17392.616605023257, Loss: 4.1814446449279785
99
+ ------------------------------------------------------------------
100
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001
101
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
102
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
103
+ warnings.warn(
104
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
105
+ warnings.warn(
106
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/model.pt
107
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
108
+ [rank0]:[2024-08-12 07:33:22,462] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.006542664999869885, 'preprocessing_with_comm': 0.0007797380003466969, 'state_converting': 0.9963913259998662, <Type.ALL: 'all'>: 1.0051406040001893})
109
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/optimizer.pt
110
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
111
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/scheduler.pt
112
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
113
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001/rng.pt
114
+ None
115
+ /work/llm_recipes/models/yans-qwen2-0.5B/tokenizer
116
+ Saved checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001, took 4.39s
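The checkpoint layout logged above is a set of plain per-component torch files under an iteration directory: model.pt, optimizer.pt, scheduler.pt and rng.pt. The restore side lives in the llm-recipes training scripts; as a rough sketch of how those files fit together (an assumption, not the repository's own code):

```python
# Minimal sketch (assumption): reading back the artifacts written at iter_0000001.
# With world size 1 the model state dict is a full (NO_SHARD) state dict, as the warnings above note.
import torch

ckpt_dir = "/work/llm_recipes/models/yans-qwen2-0.5B/iter_0000001"

model_state = torch.load(f"{ckpt_dir}/model.pt", map_location="cpu")      # model weights
optim_state = torch.load(f"{ckpt_dir}/optimizer.pt", map_location="cpu")  # Adam moments
sched_state = torch.load(f"{ckpt_dir}/scheduler.pt", map_location="cpu")  # cosine LR scheduler state
rng_state = torch.load(f"{ckpt_dir}/rng.pt", map_location="cpu")          # RNG states for reproducible resume

# model.load_state_dict(model_state) would follow, after building Qwen2ForCausalLM as shown earlier in the log.
```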
wandb/run-20240812_073202-yby212na/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
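The requirements list above is the environment snapshot wandb captures at run start. To check a live environment against it, a comparison along these lines works (my own sketch; it only handles exact `==` pins and assumes the file path of this run directory):

```python
# Sketch: diff the captured requirements.txt against the currently installed packages.
from importlib.metadata import version, PackageNotFoundError

with open("wandb/run-20240812_073202-yby212na/files/requirements.txt") as f:
    pins = [line.strip() for line in f if "==" in line]

for pin in pins:
    name, wanted = pin.split("==", 1)
    try:
        have = version(name)
    except PackageNotFoundError:
        have = "missing"
    if have != wanted:
        print(f"{name}: captured {wanted}, installed {have}")
```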
wandb/run-20240812_073202-yby212na/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T22:32:03.032279",
5
+ "startedAt": "2024-08-11T22:32:02.353340",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "1",
56
+ "--eval-interval",
57
+ "5",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-07:31:51"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
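The `executable`, `program`, and `args` fields in this metadata are enough to reconstruct the exact launch command. A small helper (my own sketch, not part of the repository) that rebuilds it from the JSON:

```python
# Sketch: rebuild the training command recorded in wandb-metadata.json.
import json
import shlex

with open("wandb/run-20240812_073202-yby212na/files/wandb-metadata.json") as f:
    meta = json.load(f)

cmd = [meta["executable"], meta["program"], *meta["args"]]
print(" ".join(shlex.quote(part) for part in cmd))
# -> /usr/bin/python /project/examples/finetuning.py --seq-length 4096 --sliding-window-size 4096 ...
```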
wandb/run-20240812_073202-yby212na/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 132}, "training/loss": 4.1814446449279785, "training/perplexity": 65.46035190441053, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1, "optimizer/lr": 1.038e-06, "optimizer/variance_l2": 0.001437161465185535, "optimizer/variance_sqrt_l2": 0.22307888709863474, "optimizer/momentum_l2": 0.09989735636562776, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.04984140396118164, "optimizer/variance_sqrt_l1": 889.25, "optimizer/momentum_l1": 397.875, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.00101470947265625, "optimizer/variance_sqrt_abs_max": 0.03173828125, "optimizer/momentum_abs_max": 0.0142822265625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 75.37911228499979, "stats/tokens_per_sec": 17392.616605023257, "stats/tokens_per_sec_per_gpu": 17392.616605023257, "stats/tflops": 69.93553660778689, "_timestamp": 1723415599.9530108, "_runtime": 77.58678984642029, "_step": 1}
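The derived metrics in this summary follow directly from the raw values it also records: perplexity is exp(loss), and tokens per second is seq_len × global_batch_size / iteration_time. A quick consistency check (assuming those definitions, which the logged numbers agree with):

```python
# Sketch: sanity-check the derived metrics in wandb-summary.json.
import math

loss = 4.1814446449279785           # training/loss
seq_len = 4097                      # utils/seq_len as logged (one more than --seq-length 4096)
global_batch_size = 320             # utils/global_batch_size
iteration_time = 75.37911228499979  # stats/1_iteration_time, in seconds

print(math.exp(loss))                                # ~65.46   -> training/perplexity
print(seq_len * global_batch_size / iteration_time)  # ~17392.6 -> stats/tokens_per_sec
```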
wandb/run-20240812_073202-yby212na/logs/debug-internal.log ADDED
@@ -0,0 +1,236 @@
1
+ 2024-08-12 07:32:02,368 INFO StreamThr :14458 [internal.py:wandb_internal():86] W&B internal server running at pid: 14458, started at: 2024-08-12 07:32:02.367023
2
+ 2024-08-12 07:32:02,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 07:32:02,371 INFO WriterThread:14458 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
4
+ 2024-08-12 07:32:02,372 DEBUG SenderThread:14458 [sender.py:send():382] send: header
5
+ 2024-08-12 07:32:02,386 DEBUG SenderThread:14458 [sender.py:send():382] send: run
6
+ 2024-08-12 07:32:02,917 INFO SenderThread:14458 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_073202-yby212na/files
7
+ 2024-08-12 07:32:02,917 INFO SenderThread:14458 [sender.py:_start_run_threads():1136] run started: yby212na with start time 1723415522.366221
8
+ 2024-08-12 07:32:02,923 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 07:32:02,923 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 07:32:03,012 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 07:32:03,018 DEBUG HandlerThread:14458 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 07:32:03,018 INFO HandlerThread:14458 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 07:32:03,019 INFO HandlerThread:14458 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 07:32:03,019 INFO SystemMonitor:14458 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 07:32:03,020 INFO SystemMonitor:14458 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 07:32:03,021 INFO SystemMonitor:14458 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 07:32:03,022 INFO SystemMonitor:14458 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 07:32:03,032 DEBUG HandlerThread:14458 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 07:32:03,034 DEBUG HandlerThread:14458 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 07:32:03,046 DEBUG HandlerThread:14458 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 07:32:03,047 DEBUG HandlerThread:14458 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:32:03.032279', 'startedAt': '2024-08-11T22:32:02.353340', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '1', '--eval-interval', '5', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:31:51'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 07:32:03,047 INFO HandlerThread:14458 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 07:32:03,048 INFO HandlerThread:14458 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 07:32:03,054 DEBUG SenderThread:14458 [sender.py:send():382] send: files
30
+ 2024-08-12 07:32:03,054 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 07:32:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 07:32:03,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-12 07:32:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-12 07:32:03,067 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 07:32:03,383 DEBUG SenderThread:14458 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 07:32:03,716 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /tmp/tmpjkv15ab8wandb/lrd2pdzk-wandb-metadata.json
38
+ 2024-08-12 07:32:03,919 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
39
+ 2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/output.log
40
+ 2024-08-12 07:32:03,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json
41
+ 2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
42
+ 2024-08-12 07:32:04,384 DEBUG SenderThread:14458 [sender.py:send():382] send: config
43
+ 2024-08-12 07:32:05,920 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
44
+ 2024-08-12 07:32:07,384 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-12 07:32:12,385 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-12 07:32:17,386 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
47
+ 2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-12 07:32:18,065 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
49
+ 2024-08-12 07:32:18,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
50
+ 2024-08-12 07:32:23,322 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-12 07:32:28,323 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
52
+ 2024-08-12 07:32:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
53
+ 2024-08-12 07:32:33,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
54
+ 2024-08-12 07:32:33,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
55
+ 2024-08-12 07:32:34,273 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-12 07:32:34,938 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/config.yaml
57
+ 2024-08-12 07:32:39,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 07:32:44,667 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-12 07:32:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
60
+ 2024-08-12 07:32:48,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
61
+ 2024-08-12 07:32:48,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
62
+ 2024-08-12 07:32:50,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-12 07:32:55,338 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-12 07:33:00,339 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
65
+ 2024-08-12 07:33:03,022 DEBUG SystemMonitor:14458 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-12 07:33:03,024 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
67
+ 2024-08-12 07:33:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
68
+ 2024-08-12 07:33:03,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
69
+ 2024-08-12 07:33:03,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
70
+ 2024-08-12 07:33:06,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-12 07:33:11,281 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-12 07:33:16,282 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-12 07:33:18,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
74
+ 2024-08-12 07:33:18,064 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
75
+ 2024-08-12 07:33:18,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
76
+ 2024-08-12 07:33:19,954 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: partial_history
77
+ 2024-08-12 07:33:21,450 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
78
+ 2024-08-12 07:33:21,967 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
79
+ 2024-08-12 07:33:23,969 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
80
+ 2024-08-12 07:33:25,970 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
81
+ 2024-08-12 07:33:27,344 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 07:33:32,345 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-12 07:33:33,025 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
84
+ 2024-08-12 07:33:33,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-12 07:33:33,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
86
+ 2024-08-12 07:33:33,066 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
87
+ 2024-08-12 07:33:38,333 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-12 07:33:43,334 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 07:33:48,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
90
+ 2024-08-12 07:33:48,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
91
+ 2024-08-12 07:33:48,104 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
92
+ 2024-08-12 07:33:49,288 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-12 07:33:54,289 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 07:33:59,290 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 07:34:03,026 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
96
+ 2024-08-12 07:34:03,064 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: stop_status
97
+ 2024-08-12 07:34:03,065 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: stop_status
98
+ 2024-08-12 07:34:03,108 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: internal_messages
99
+ 2024-08-12 07:34:05,251 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
100
+ 2024-08-12 07:34:10,252 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 07:34:15,187 DEBUG SenderThread:14458 [sender.py:send():382] send: exit
102
+ 2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():589] handling exit code: 255
103
+ 2024-08-12 07:34:15,187 INFO SenderThread:14458 [sender.py:send_exit():591] handling runtime: 132
104
+ 2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
105
+ 2024-08-12 07:34:15,189 INFO SenderThread:14458 [sender.py:send_exit():597] send defer
106
+ 2024-08-12 07:34:15,189 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 0
108
+ 2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 0
110
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 1
111
+ 2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
112
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 1
113
+ 2024-08-12 07:34:15,190 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
114
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 1
115
+ 2024-08-12 07:34:15,190 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 2
116
+ 2024-08-12 07:34:15,190 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
117
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 2
118
+ 2024-08-12 07:34:15,190 INFO HandlerThread:14458 [system_monitor.py:finish():203] Stopping system monitor
119
+ 2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():179] Finished system metrics aggregation loop
120
+ 2024-08-12 07:34:15,191 DEBUG SystemMonitor:14458 [system_monitor.py:_start():183] Publishing last batch of metrics
121
+ 2024-08-12 07:34:15,191 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined cpu monitor
122
+ 2024-08-12 07:34:15,192 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined disk monitor
123
+ 2024-08-12 07:34:15,225 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined gpu monitor
124
+ 2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined memory monitor
125
+ 2024-08-12 07:34:15,226 INFO HandlerThread:14458 [interfaces.py:finish():202] Joined network monitor
126
+ 2024-08-12 07:34:15,226 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
127
+ 2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 2
128
+ 2024-08-12 07:34:15,226 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 3
129
+ 2024-08-12 07:34:15,227 DEBUG SenderThread:14458 [sender.py:send():382] send: stats
130
+ 2024-08-12 07:34:15,227 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-08-12 07:34:15,227 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 3
132
+ 2024-08-12 07:34:15,229 DEBUG SenderThread:14458 [sender.py:send():382] send: history
133
+ 2024-08-12 07:34:15,230 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: summary_record
134
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
135
+ 2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
136
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 3
137
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 4
138
+ 2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
139
+ 2024-08-12 07:34:15,231 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 4
140
+ 2024-08-12 07:34:15,231 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
141
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 4
142
+ 2024-08-12 07:34:15,231 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 5
143
+ 2024-08-12 07:34:15,231 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-12 07:34:15,232 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 5
145
+ 2024-08-12 07:34:15,232 DEBUG SenderThread:14458 [sender.py:send():382] send: summary
146
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
147
+ 2024-08-12 07:34:15,233 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
148
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 5
149
+ 2024-08-12 07:34:15,233 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 6
150
+ 2024-08-12 07:34:15,233 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
151
+ 2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 6
152
+ 2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
153
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 6
154
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 7
155
+ 2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
156
+ 2024-08-12 07:34:15,234 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-12 07:34:15,234 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 7
158
+ 2024-08-12 07:34:15,234 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-12 07:34:15,234 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 7
160
+ 2024-08-12 07:34:15,862 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 8
161
+ 2024-08-12 07:34:15,862 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-12 07:34:15,862 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 8
163
+ 2024-08-12 07:34:15,863 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-12 07:34:15,863 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 8
165
+ 2024-08-12 07:34:15,863 INFO SenderThread:14458 [job_builder.py:build():296] Attempting to build job artifact
166
+ 2024-08-12 07:34:15,864 INFO SenderThread:14458 [job_builder.py:_get_source_type():426] is repo sourced job
167
+ 2024-08-12 07:34:15,878 INFO SenderThread:14458 [job_builder.py:build():402] adding wandb-job metadata file
168
+ 2024-08-12 07:34:15,887 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 9
169
+ 2024-08-12 07:34:15,887 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
170
+ 2024-08-12 07:34:15,887 DEBUG SenderThread:14458 [sender.py:send():382] send: artifact
171
+ 2024-08-12 07:34:15,887 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 9
172
+ 2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_073202-yby212na/files/output.log
173
+ 2024-08-12 07:34:16,002 INFO Thread-12 :14458 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
174
+ 2024-08-12 07:34:16,187 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
176
+ 2024-08-12 07:34:16,750 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
177
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 9
178
+ 2024-08-12 07:34:16,750 INFO SenderThread:14458 [dir_watcher.py:finish():358] shutting down directory watcher
179
+ 2024-08-12 07:34:17,003 INFO SenderThread:14458 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_073202-yby212na/files
180
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/requirements.txt requirements.txt
181
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/config.yaml config.yaml
182
+ 2024-08-12 07:34:17,004 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-metadata.json wandb-metadata.json
183
+ 2024-08-12 07:34:17,006 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json wandb-summary.json
184
+ 2024-08-12 07:34:17,008 INFO SenderThread:14458 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_073202-yby212na/files/output.log output.log
185
+ 2024-08-12 07:34:17,009 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 10
186
+ 2024-08-12 07:34:17,009 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
187
+ 2024-08-12 07:34:17,011 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
188
+ 2024-08-12 07:34:17,011 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 10
189
+ 2024-08-12 07:34:17,012 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
190
+ 2024-08-12 07:34:17,012 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 10
191
+ 2024-08-12 07:34:17,012 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
192
+ 2024-08-12 07:34:17,188 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: poll_exit
193
+ 2024-08-12 07:34:17,188 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: poll_exit
194
+ 2024-08-12 07:34:17,408 INFO wandb-upload_1:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/config.yaml
195
+ 2024-08-12 07:34:17,511 INFO wandb-upload_0:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/requirements.txt
196
+ 2024-08-12 07:34:17,588 INFO wandb-upload_2:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/wandb-summary.json
197
+ 2024-08-12 07:34:17,614 INFO wandb-upload_3:14458 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_073202-yby212na/files/output.log
198
+ 2024-08-12 07:34:17,814 INFO Thread-11 (_thread_body):14458 [sender.py:transition_state():617] send defer: 11
199
+ 2024-08-12 07:34:17,814 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
200
+ 2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 11
201
+ 2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
202
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 11
203
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
204
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 12
205
+ 2024-08-12 07:34:17,815 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
206
+ 2024-08-12 07:34:17,815 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 12
207
+ 2024-08-12 07:34:17,815 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
208
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 12
209
+ 2024-08-12 07:34:17,815 INFO SenderThread:14458 [file_stream.py:finish():595] file stream finish called
210
+ 2024-08-12 07:34:18,362 INFO SenderThread:14458 [file_stream.py:finish():599] file stream finish is done
211
+ 2024-08-12 07:34:18,362 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 13
212
+ 2024-08-12 07:34:18,362 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
213
+ 2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 13
214
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
215
+ 2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 13
216
+ 2024-08-12 07:34:18,363 INFO SenderThread:14458 [sender.py:transition_state():617] send defer: 14
217
+ 2024-08-12 07:34:18,363 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: defer
218
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: final
219
+ 2024-08-12 07:34:18,363 INFO HandlerThread:14458 [handler.py:handle_request_defer():172] handle defer: 14
220
+ 2024-08-12 07:34:18,363 DEBUG SenderThread:14458 [sender.py:send():382] send: footer
221
+ 2024-08-12 07:34:18,364 DEBUG SenderThread:14458 [sender.py:send_request():409] send_request: defer
222
+ 2024-08-12 07:34:18,364 INFO SenderThread:14458 [sender.py:send_request_defer():613] handle sender defer: 14
223
+ 2024-08-12 07:34:21,364 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
224
+ 2024-08-12 07:34:26,365 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
225
+ 2024-08-12 07:34:31,366 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
226
+ 2024-08-12 07:34:36,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-12 07:34:41,367 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
228
+ 2024-08-12 07:34:46,368 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
229
+ 2024-08-12 07:34:51,369 DEBUG HandlerThread:14458 [handler.py:handle_request():146] handle_request: status_report
230
+ 2024-08-12 07:34:51,550 WARNING StreamThr :14458 [internal.py:is_dead():414] Internal process exiting, parent pid 14387 disappeared
231
+ 2024-08-12 07:34:51,550 ERROR StreamThr :14458 [internal.py:wandb_internal():152] Internal process shutdown.
232
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [sender.py:finish():1572] shutting down sender
233
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:finish():172] shutting down file pusher
234
+ 2024-08-12 07:34:52,369 INFO SenderThread:14458 [file_pusher.py:join():178] waiting for file pusher
235
+ 2024-08-12 07:34:52,369 INFO HandlerThread:14458 [handler.py:finish():869] shutting down handler
236
+ 2024-08-12 07:34:52,369 INFO WriterThread:14458 [datastore.py:close():296] close: /project/wandb/run-20240812_073202-yby212na/run-yby212na.wandb
wandb/run-20240812_073202-yby212na/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Configure stats pid to 14387
3
+ 2024-08-12 07:32:02,359 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_073202-yby212na/logs/debug.log
9
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_073202-yby212na/logs/debug-internal.log
10
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:31:51', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 5, 'save_interval': 1, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 07:32:02,360 INFO MainThread:14387 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 07:32:02,365 INFO MainThread:14387 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 07:32:02,366 INFO MainThread:14387 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 07:32:02,370 INFO MainThread:14387 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 07:32:02,382 INFO MainThread:14387 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 07:32:02,922 INFO MainThread:14387 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 07:32:03,004 INFO MainThread:14387 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 07:32:03,064 INFO MainThread:14387 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 07:32:03,065 INFO MainThread:14387 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 07:32:04,383 INFO MainThread:14387 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
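debug.log shows the run being created through a standard wandb.init call, with the full training namespace passed as config and the model architecture merged in later via the config callback. A stripped-down sketch of that call, using only values visible in the log (the real script passes the complete argument namespace):

```python
# Sketch (assumption): the shape of the wandb.init call behind this run.
import wandb

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="yans-qwen2-0.5B_train_2024-08-12-07:31:51",
    config={
        "model_architecture": "Qwen2ForCausalLM",
        "hidden_size": 896,
        "num_hidden_layers": 24,
        "global_batch_size": 320,
        "seq_length": 4096,
    },
)
# the training loop then calls run.log({...}) each iteration before run.finish()
```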
wandb/run-20240812_073202-yby212na/run-yby212na.wandb ADDED
Binary file (26.1 kB). View file
 
wandb/run-20240815_041534-1ld4rgmy/files/config.yaml ADDED
@@ -0,0 +1,337 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-15-04:15:21
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 10
138
+ save_interval:
139
+ desc: null
140
+ value: 10
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723662934.646627
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ - 105
300
+ 2:
301
+ - 1
302
+ - 11
303
+ - 49
304
+ - 55
305
+ - 71
306
+ - 105
307
+ 3:
308
+ - 13
309
+ - 16
310
+ - 23
311
+ 4: 3.10.12
312
+ 5: 0.16.3
313
+ 6: 4.43.3
314
+ 8:
315
+ - 5
316
+ 13: linux-x86_64
317
+ model_architecture:
318
+ desc: null
319
+ value: Qwen2ForCausalLM
320
+ activation_function:
321
+ desc: null
322
+ value: silu
323
+ hidden_size:
324
+ desc: null
325
+ value: 896
326
+ model_type:
327
+ desc: null
328
+ value: qwen2
329
+ max_position_embeddings:
330
+ desc: null
331
+ value: 4096
332
+ num_attention_heads:
333
+ desc: null
334
+ value: 14
335
+ num_hidden_layers:
336
+ desc: null
337
+ value: 24
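
The batch arithmetic recorded in this config is internally consistent: with world_size 1 and micro_batch_size 1, a global batch of 320 implies 320 gradient-accumulation steps. A minimal sketch of that check in Python, assuming the usual relation global batch = micro batch x data-parallel ranks x accumulation steps (values are copied from the config above; the code is illustrative, not taken from the repo):

    # Sanity check of the batch settings in this run's config.yaml (illustrative only).
    global_batch_size = 320   # global_batch_size
    micro_batch_size = 1      # micro_batch_size
    world_size = 1            # world_size (single A100 in this run)

    # Assumed relation: global = micro * data-parallel ranks * accumulation steps.
    grad_accum = global_batch_size // (micro_batch_size * world_size)
    assert grad_accum == 320  # matches gradient_accumulation_steps above
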
wandb/run-20240815_041534-1ld4rgmy/files/output.log ADDED
@@ -0,0 +1,92 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ --> applying fsdp activation checkpointing...
11
+ > datasets target sizes (minimum size):
12
+ train: 6400000
13
+ validation: 6403200
14
+ test: 3200
15
+ > building train, validation, and test datasets for GPT ...
16
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
17
+ warnings.warn(
18
+ Let split = None
19
+ Building a BlendedDataset for a single MegatronDataset
20
+ Unable to save the indexes because path_to_cache is None
21
+ Building a BlendedDataset for a single MegatronDataset
22
+ Unable to save the indexes because path_to_cache is None
23
+ Building a BlendedDataset for a single MegatronDataset
24
+ Unable to save the indexes because path_to_cache is None
25
+ > finished creating GPT datasets ...
26
+ Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
27
+ Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
30
+ (model): Qwen2Model(
31
+ (embed_tokens): Embedding(151936, 896)
32
+ (layers): ModuleList(
33
+ (0-23): 24 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
36
+ (self_attn): Qwen2FlashAttention2(
37
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
38
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
40
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
41
+ (rotary_emb): Qwen2RotaryEmbedding()
42
+ )
43
+ (mlp): Qwen2MLP(
44
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
46
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): Qwen2RMSNorm()
50
+ (post_attention_layernorm): Qwen2RMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): Qwen2RMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
58
+ )
59
+ )
60
+ model config: Qwen2Config {
61
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
62
+ "architectures": [
63
+ "Qwen2ForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 151643,
67
+ "eos_token_id": 151643,
68
+ "hidden_act": "silu",
69
+ "hidden_size": 896,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4864,
72
+ "label_smoothing": 0.0,
73
+ "max_position_embeddings": 4096,
74
+ "max_window_layers": 24,
75
+ "model_type": "qwen2",
76
+ "num_attention_heads": 14,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": null,
82
+ "tie_word_embeddings": true,
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "use_sliding_window": false,
87
+ "vocab_size": 151936
88
+ }
89
+ [rank0]:[2024-08-15 04:15:41,598] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
90
+ ------------------------------------------------------------------
91
+ iteration: 1161 , TFLOPS: 71.0304706218284, Tokens per sec: 17664.9211934734, Loss: 2.442603349685669
92
+ ------------------------------------------------------------------
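
The log above records the FSDP configuration actually applied: FULL_SHARD was requested but FSDP falls back to NO_SHARD because the world size is 1, bfloat16 mixed precision is enabled ("bfSixteen policy"), and activation checkpointing is applied. Below is a minimal Python sketch of that kind of setup; it is an assumption about the wrapping code, not the repo's finetuning.py, and it presumes torch.distributed is already initialized:

    import torch
    from torch.distributed.fsdp import (
        FullyShardedDataParallel as FSDP,
        MixedPrecision,
        ShardingStrategy,
    )
    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        apply_activation_checkpointing,
    )

    def wrap_for_training(model: torch.nn.Module) -> FSDP:
        # "BFloat16 enabled for mixed precision - using bfSixteen policy"
        bf16_policy = MixedPrecision(
            param_dtype=torch.bfloat16,
            reduce_dtype=torch.bfloat16,
            buffer_dtype=torch.bfloat16,
        )
        # FULL_SHARD is requested; with world_size == 1 FSDP switches to NO_SHARD,
        # which is the UserWarning shown in the log.
        wrapped = FSDP(
            model,
            sharding_strategy=ShardingStrategy.FULL_SHARD,
            mixed_precision=bf16_policy,
        )
        # "--> applying fsdp activation checkpointing..."
        apply_activation_checkpointing(wrapped)
        return wrapped
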
wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ cachetools==5.3.2
23
+ catalogue==2.0.10
24
+ certifi==2024.2.2
25
+ cffi==1.16.0
26
+ chardet==5.2.0
27
+ charset-normalizer==3.3.2
28
+ click==8.1.7
29
+ cloudpathlib==0.16.0
30
+ cloudpickle==3.0.0
31
+ cmake==3.28.1
32
+ colorama==0.4.6
33
+ comm==0.2.1
34
+ confection==0.1.4
35
+ contourpy==1.2.0
36
+ cramjam==2.8.3
37
+ cubinlinker==0.3.0+2.g405ac64
38
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
39
+ cudf==23.12.0
40
+ cugraph-dgl==23.12.0
41
+ cugraph-service-client==23.12.0
42
+ cugraph-service-server==23.12.0
43
+ cugraph==23.12.0
44
+ cuml==23.12.0
45
+ cupy-cuda12x==12.3.0
46
+ cycler==0.12.1
47
+ cymem==2.0.8
48
+ cython==3.0.8
49
+ dask-cuda==23.12.0
50
+ dask-cudf==23.12.0
51
+ dask==2023.11.0
52
+ dataclasses-json==0.6.7
53
+ dataproperty==1.0.1
54
+ datasets==2.20.0
55
+ debugpy==1.8.1
56
+ decorator==5.1.1
57
+ defusedxml==0.7.1
58
+ dill==0.3.8
59
+ distributed==2023.11.0
60
+ distro==1.9.0
61
+ dm-tree==0.1.8
62
+ docker-pycreds==0.4.0
63
+ einops==0.7.0
64
+ emoji==2.12.1
65
+ entmax==1.3
66
+ evaluate==0.4.2
67
+ exceptiongroup==1.2.0
68
+ execnet==2.0.2
69
+ executing==2.0.1
70
+ expecttest==0.1.3
71
+ fastjsonschema==2.19.1
72
+ fastparquet==2023.10.1
73
+ fastrlock==0.8.2
74
+ filelock==3.13.1
75
+ flash-attn==2.4.2
76
+ fonttools==4.48.1
77
+ frozenlist==1.4.1
78
+ fsspec==2023.12.2
79
+ fugashi==1.3.2
80
+ fuzzywuzzy==0.18.0
81
+ gast==0.5.4
82
+ gitdb==4.0.11
83
+ gitpython==3.1.43
84
+ google-auth-oauthlib==0.4.6
85
+ google-auth==2.27.0
86
+ graphsurgeon==0.4.6
87
+ greenlet==3.0.3
88
+ grpcio==1.60.1
89
+ h11==0.14.0
90
+ httpcore==1.0.5
91
+ httpx==0.27.0
92
+ huggingface-hub==0.24.5
93
+ hydra-core==1.3.2
94
+ hypothesis==5.35.1
95
+ idna==3.6
96
+ importlib-metadata==7.0.1
97
+ iniconfig==2.0.0
98
+ intel-openmp==2021.4.0
99
+ ipadic==1.0.0
100
+ ipykernel==6.29.2
101
+ ipython-genutils==0.2.0
102
+ ipython==8.21.0
103
+ isort==5.13.2
104
+ jedi==0.19.1
105
+ jinja2==3.1.3
106
+ jiter==0.5.0
107
+ joblib==1.3.2
108
+ json5==0.9.14
109
+ jsonargparse==3.13.1
110
+ jsonlines==4.0.0
111
+ jsonnet==0.19.1
112
+ jsonpatch==1.33
113
+ jsonpointer==3.0.0
114
+ jsonschema-specifications==2023.12.1
115
+ jsonschema==4.21.1
116
+ jupyter-client==8.6.0
117
+ jupyter-core==5.7.1
118
+ jupyter-tensorboard==0.2.0
119
+ jupyterlab-pygments==0.3.0
120
+ jupyterlab-server==1.2.0
121
+ jupyterlab==2.3.2
122
+ jupytext==1.16.1
123
+ kiwisolver==1.4.5
124
+ langchain-community==0.2.12
125
+ langchain-core==0.2.31
126
+ langchain-huggingface==0.0.2
127
+ langchain-openai==0.1.21
128
+ langchain-text-splitters==0.2.2
129
+ langchain==0.2.13
130
+ langcodes==3.3.0
131
+ langsmith==0.1.99
132
+ lazy-loader==0.3
133
+ levenshtein==0.25.1
134
+ librosa==0.10.1
135
+ lightning-utilities==0.11.6
136
+ llm-jp-eval==1.4.0
137
+ llvmlite==0.40.1
138
+ lm-eval==0.3.0
139
+ locket==1.0.0
140
+ logzero==1.7.0
141
+ lxml==5.2.2
142
+ markdown-it-py==3.0.0
143
+ markdown==3.5.2
144
+ markupsafe==2.1.4
145
+ marshmallow==3.21.3
146
+ matplotlib-inline==0.1.6
147
+ matplotlib==3.8.2
148
+ mbstrdecoder==1.1.3
149
+ mccabe==0.7.0
150
+ mdit-py-plugins==0.4.0
151
+ mdurl==0.1.2
152
+ mecab-python3==1.0.6
153
+ mistune==3.0.2
154
+ mkl-devel==2021.1.1
155
+ mkl-include==2021.1.1
156
+ mkl==2021.1.1
157
+ mock==5.1.0
158
+ mojimoji==0.0.13
159
+ more-itertools==9.1.0
160
+ mpmath==1.3.0
161
+ msgpack==1.0.7
162
+ multidict==6.0.4
163
+ multiprocess==0.70.16
164
+ murmurhash==1.0.10
165
+ mypy-extensions==1.0.0
166
+ nbclient==0.9.0
167
+ nbconvert==7.16.0
168
+ nbformat==5.9.2
169
+ neologdn==0.5.3
170
+ nest-asyncio==1.6.0
171
+ networkx==2.6.3
172
+ ninja==1.11.1.1
173
+ nltk==3.8.1
174
+ notebook==6.4.10
175
+ numba==0.57.1+1.g1ff679645
176
+ numexpr==2.10.1
177
+ numpy==1.24.4
178
+ nvfuser==0.1.4a0+d0bb811
179
+ nvidia-dali-cuda120==1.34.0
180
+ nvidia-pyindex==1.0.9
181
+ nvtx==0.2.5
182
+ oauthlib==3.2.2
183
+ omegaconf==2.3.0
184
+ onnx==1.15.0rc2
185
+ openai==1.40.6
186
+ opencv==4.7.0
187
+ optree==0.10.0
188
+ orjson==3.10.7
189
+ packaging==23.2
190
+ pandas==2.2.2
191
+ pandocfilters==1.5.1
192
+ parso==0.8.3
193
+ partd==1.4.1
194
+ pathvalidate==3.2.0
195
+ peft==0.5.0
196
+ pexpect==4.9.0
197
+ pillow==10.2.0
198
+ pip==24.0
199
+ plac==1.4.3
200
+ platformdirs==4.2.0
201
+ pluggy==1.4.0
202
+ ply==3.11
203
+ polygraphy==0.49.4
204
+ pooch==1.8.0
205
+ portalocker==2.10.1
206
+ preshed==3.0.9
207
+ prettytable==3.9.0
208
+ prometheus-client==0.19.0
209
+ prompt-toolkit==3.0.43
210
+ protobuf==4.24.4
211
+ psutil==5.9.4
212
+ ptxcompiler==0.8.1+2.g0d406d6
213
+ ptyprocess==0.7.0
214
+ pure-eval==0.2.2
215
+ pyarrow-hotfix==0.6
216
+ pyarrow==15.0.2
217
+ pyasn1-modules==0.3.0
218
+ pyasn1==0.5.1
219
+ pybind11-global==2.11.1
220
+ pybind11==2.11.1
221
+ pycocotools==2.0+nv0.8.0
222
+ pycountry==24.6.1
223
+ pycparser==2.21
224
+ pydantic-core==2.16.2
225
+ pydantic==2.6.1
226
+ pygments==2.17.2
227
+ pylibcugraph==23.12.0
228
+ pylibcugraphops==23.12.0
229
+ pylibraft==23.12.0
230
+ pylint==3.2.6
231
+ pynvml==11.4.1
232
+ pyparsing==3.1.1
233
+ pytablewriter==1.2.0
234
+ pytest-flakefinder==1.1.0
235
+ pytest-rerunfailures==13.0
236
+ pytest-shard==0.1.2
237
+ pytest-xdist==3.5.0
238
+ pytest==8.0.0
239
+ python-dateutil==2.8.2
240
+ python-dotenv==1.0.0
241
+ python-hostlist==1.23.0
242
+ python-levenshtein==0.25.1
243
+ pytorch-lightning==2.4.0
244
+ pytorch-quantization==2.1.2
245
+ pytz==2023.3.post1
246
+ pyyaml==6.0.1
247
+ pyzmq==25.1.2
248
+ raft-dask==23.12.0
249
+ rapidfuzz==3.9.6
250
+ rapids-dask-dependency==23.12.1
251
+ referencing==0.33.0
252
+ regex==2023.12.25
253
+ requests-oauthlib==1.3.1
254
+ requests==2.32.3
255
+ rhoknp==1.7.0
256
+ rich==13.7.0
257
+ rmm==23.12.0
258
+ rouge-score==0.1.2
259
+ rpds-py==0.17.1
260
+ rsa==4.9
261
+ sacrebleu==2.4.2
262
+ safetensors==0.4.3
263
+ scikit-learn==1.5.1
264
+ scipy==1.12.0
265
+ send2trash==1.8.2
266
+ sentence-transformers==3.0.1
267
+ sentencepiece==0.1.99
268
+ sentry-sdk==2.12.0
269
+ setproctitle==1.3.3
270
+ setuptools==68.2.2
271
+ six==1.16.0
272
+ smart-open==6.4.0
273
+ smmap==5.0.1
274
+ sniffio==1.3.1
275
+ sortedcontainers==2.4.0
276
+ soundfile==0.12.1
277
+ soupsieve==2.5
278
+ soxr==0.3.7
279
+ spacy-legacy==3.0.12
280
+ spacy-loggers==1.0.5
281
+ spacy==3.7.2
282
+ sphinx-glpi-theme==0.6
283
+ sqlalchemy==2.0.32
284
+ sqlitedict==2.1.0
285
+ srsly==2.4.8
286
+ stack-data==0.6.3
287
+ sumeval==0.2.2
288
+ sympy==1.12
289
+ tabledata==1.3.3
290
+ tabulate==0.9.0
291
+ tbb==2021.11.0
292
+ tblib==3.0.0
293
+ tcolorpy==0.1.6
294
+ tenacity==8.5.0
295
+ tensorboard-data-server==0.6.1
296
+ tensorboard-plugin-wit==1.8.1
297
+ tensorboard==2.9.0
298
+ tensorrt==8.6.3
299
+ terminado==0.18.0
300
+ termplotlib==0.3.9
301
+ text-generation==0.7.0
302
+ thinc==8.2.3
303
+ threadpoolctl==3.2.0
304
+ thriftpy2==0.4.17
305
+ tiktoken==0.7.0
306
+ tinycss2==1.2.1
307
+ tokenizers==0.19.1
308
+ toml==0.10.2
309
+ tomli==2.0.1
310
+ tomlkit==0.13.2
311
+ toolz==0.12.1
312
+ torch-tensorrt==2.3.0a0
313
+ torch==2.3.0a0+ebedce2
314
+ torchdata==0.7.1a0
315
+ torchmetrics==0.10.3
316
+ torchtext==0.17.0a0
317
+ torchvision==0.18.0a0
318
+ tornado==6.4
319
+ tqdm-multiprocess==0.0.11
320
+ tqdm==4.66.5
321
+ traitlets==5.9.0
322
+ transformer-engine==1.3.0+5b90b7f
323
+ transformers==4.43.3
324
+ treelite-runtime==3.9.1
325
+ treelite==3.9.1
326
+ triton==2.2.0+e28a256
327
+ typepy==1.3.2
328
+ typer==0.9.0
329
+ types-dataclasses==0.6.6
330
+ typing-extensions==4.12.2
331
+ typing-inspect==0.9.0
332
+ tzdata==2024.1
333
+ ucx-py==0.35.0
334
+ uff==0.6.9
335
+ ujson==5.8.0
336
+ unbabel-comet==2.2.2
337
+ unidic-lite==1.0.8
338
+ urllib3==1.26.18
339
+ wandb==0.16.3
340
+ wasabi==1.1.2
341
+ wcwidth==0.2.13
342
+ weasel==0.3.4
343
+ webencodings==0.5.1
344
+ werkzeug==3.0.1
345
+ wheel==0.42.0
346
+ word2number==1.1
347
+ xdoctest==1.0.2
348
+ xgboost==1.7.6
349
+ xmltodict==0.13.0
350
+ xxhash==3.4.1
351
+ yarl==1.9.4
352
+ zict==3.0.0
353
+ zipp==3.17.0
354
+ zstandard==0.23.0
wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-14T19:15:35.173102",
5
+ "startedAt": "2024-08-14T19:15:34.633818",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "10",
56
+ "--eval-interval",
57
+ "10",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-15-04:15:21"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 86}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.0046823736576586325, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.284942626953125, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.21714400200017, "stats/tokens_per_sec": 17664.9211934734, "stats/tokens_per_sec_per_gpu": 17664.9211934734, "stats/tflops": 71.0304706218284, "_timestamp": 1723663016.4553976, "_runtime": 81.8087706565857, "_step": 1161}
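
The summary metrics above are mutually consistent: training/perplexity is exp(training/loss), and stats/tokens_per_sec equals seq_len x global_batch_size / iteration_time. The reported optimizer/lr (about 1.995e-05 at iteration 1161) is also what the config's cosine schedule (lr 2e-5, min_lr 1e-6, 500 warmup and 20000 decay iterations) predicts. A small Python check using the values above (illustrative, not the logging code itself):

    import math

    # Values copied from wandb-summary.json above.
    loss = 2.442603349685669
    seq_len = 4097                      # utils/seq_len
    global_batch_size = 320             # utils/global_batch_size
    iteration_time = 74.21714400200017  # stats/1_iteration_time, in seconds

    print(math.exp(loss))                                # ~11.50, matches training/perplexity
    print(seq_len * global_batch_size / iteration_time)  # ~17664.9, matches stats/tokens_per_sec
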
wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log ADDED
@@ -0,0 +1,162 @@
1
+ 2024-08-15 04:15:34,649 INFO StreamThr :12253 [internal.py:wandb_internal():86] W&B internal server running at pid: 12253, started at: 2024-08-15 04:15:34.648066
2
+ 2024-08-15 04:15:34,650 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-15 04:15:34,652 INFO WriterThread:12253 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
4
+ 2024-08-15 04:15:34,653 DEBUG SenderThread:12253 [sender.py:send():382] send: header
5
+ 2024-08-15 04:15:34,666 DEBUG SenderThread:12253 [sender.py:send():382] send: run
6
+ 2024-08-15 04:15:35,078 INFO SenderThread:12253 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_041534-1ld4rgmy/files
7
+ 2024-08-15 04:15:35,078 INFO SenderThread:12253 [sender.py:_start_run_threads():1136] run started: 1ld4rgmy with start time 1723662934.646627
8
+ 2024-08-15 04:15:35,084 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-15 04:15:35,084 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-15 04:15:35,155 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():27] System info init
12
+ 2024-08-15 04:15:35,161 DEBUG HandlerThread:12253 [system_info.py:__init__():42] System info init done
13
+ 2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-15 04:15:35,161 INFO SystemMonitor:12253 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-15 04:15:35,161 INFO HandlerThread:12253 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-15 04:15:35,162 INFO SystemMonitor:12253 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-15 04:15:35,163 INFO SystemMonitor:12253 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-15 04:15:35,164 INFO SystemMonitor:12253 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-15 04:15:35,173 DEBUG HandlerThread:12253 [system_info.py:probe():151] Probing system
22
+ 2024-08-15 04:15:35,175 DEBUG HandlerThread:12253 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_info.py:probe():199] Probing system done
25
+ 2024-08-15 04:15:35,188 DEBUG HandlerThread:12253 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T19:15:35.173102', 'startedAt': '2024-08-14T19:15:34.633818', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-04:15:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-15 04:15:35,188 INFO HandlerThread:12253 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-15 04:15:35,189 INFO HandlerThread:12253 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-15 04:15:35,195 DEBUG SenderThread:12253 [sender.py:send():382] send: files
30
+ 2024-08-15 04:15:35,195 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-15 04:15:35,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-15 04:15:35,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-15 04:15:35,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-15 04:15:35,209 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-15 04:15:35,448 DEBUG SenderThread:12253 [sender.py:send():382] send: telemetry
37
+ 2024-08-15 04:15:35,826 INFO wandb-upload_0:12253 [upload_job.py:push():131] Uploaded file /tmp/tmprvuc38znwandb/8jb1h2yo-wandb-metadata.json
38
+ 2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/requirements.txt
39
+ 2024-08-15 04:15:36,080 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-metadata.json
40
+ 2024-08-15 04:15:36,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
41
+ 2024-08-15 04:15:38,081 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
42
+ 2024-08-15 04:15:40,019 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-15 04:15:40,082 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
44
+ 2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
45
+ 2024-08-15 04:15:41,878 DEBUG SenderThread:12253 [sender.py:send():382] send: config
46
+ 2024-08-15 04:15:42,083 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
47
+ 2024-08-15 04:15:44,084 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
48
+ 2024-08-15 04:15:45,879 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-15 04:15:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
50
+ 2024-08-15 04:15:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
51
+ 2024-08-15 04:15:50,208 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
52
+ 2024-08-15 04:15:51,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-15 04:15:56,411 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-15 04:16:01,412 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-15 04:16:05,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
56
+ 2024-08-15 04:16:05,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
57
+ 2024-08-15 04:16:05,246 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
58
+ 2024-08-15 04:16:06,461 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-15 04:16:08,114 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/config.yaml
60
+ 2024-08-15 04:16:12,324 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-15 04:16:17,325 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-15 04:16:20,207 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
63
+ 2024-08-15 04:16:20,207 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
64
+ 2024-08-15 04:16:20,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
65
+ 2024-08-15 04:16:22,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-15 04:16:27,438 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-15 04:16:32,439 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-15 04:16:35,164 DEBUG SystemMonitor:12253 [system_monitor.py:_start():172] Starting system metrics aggregation loop
69
+ 2024-08-15 04:16:35,166 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
70
+ 2024-08-15 04:16:35,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
71
+ 2024-08-15 04:16:35,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
72
+ 2024-08-15 04:16:35,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
73
+ 2024-08-15 04:16:38,433 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-15 04:16:43,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-15 04:16:48,434 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-15 04:16:50,206 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: stop_status
77
+ 2024-08-15 04:16:50,206 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: stop_status
78
+ 2024-08-15 04:16:50,250 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: internal_messages
79
+ 2024-08-15 04:16:54,406 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
80
+ 2024-08-15 04:16:56,456 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: partial_history
81
+ 2024-08-15 04:16:58,142 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
82
+ 2024-08-15 04:16:59,499 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-15 04:17:02,068 DEBUG SenderThread:12253 [sender.py:send():382] send: exit
84
+ 2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():589] handling exit code: 255
85
+ 2024-08-15 04:17:02,069 INFO SenderThread:12253 [sender.py:send_exit():591] handling runtime: 86
86
+ 2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-15 04:17:02,070 INFO SenderThread:12253 [sender.py:send_exit():597] send defer
88
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 0
90
+ 2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
91
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 0
92
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 1
93
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
94
+ 2024-08-15 04:17:02,071 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 1
95
+ 2024-08-15 04:17:02,071 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
96
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 1
97
+ 2024-08-15 04:17:02,071 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 2
98
+ 2024-08-15 04:17:02,071 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
99
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 2
100
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [system_monitor.py:finish():203] Stopping system monitor
101
+ 2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():179] Finished system metrics aggregation loop
102
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined cpu monitor
103
+ 2024-08-15 04:17:02,072 DEBUG SystemMonitor:12253 [system_monitor.py:_start():183] Publishing last batch of metrics
104
+ 2024-08-15 04:17:02,072 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined disk monitor
105
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined gpu monitor
106
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined memory monitor
107
+ 2024-08-15 04:17:02,107 INFO HandlerThread:12253 [interfaces.py:finish():202] Joined network monitor
108
+ 2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 2
110
+ 2024-08-15 04:17:02,108 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 3
111
+ 2024-08-15 04:17:02,108 DEBUG SenderThread:12253 [sender.py:send():382] send: stats
112
+ 2024-08-15 04:17:02,108 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
113
+ 2024-08-15 04:17:02,108 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 3
114
+ 2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send():382] send: history
115
+ 2024-08-15 04:17:02,111 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: summary_record
116
+ 2024-08-15 04:17:02,112 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
117
+ 2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
118
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 3
119
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 4
120
+ 2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
121
+ 2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 4
122
+ 2024-08-15 04:17:02,113 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
123
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 4
124
+ 2024-08-15 04:17:02,113 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 5
125
+ 2024-08-15 04:17:02,113 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
126
+ 2024-08-15 04:17:02,113 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 5
127
+ 2024-08-15 04:17:02,114 DEBUG SenderThread:12253 [sender.py:send():382] send: summary
128
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
129
+ 2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
130
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 5
131
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 6
132
+ 2024-08-15 04:17:02,115 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
133
+ 2024-08-15 04:17:02,115 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 6
134
+ 2024-08-15 04:17:02,115 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
135
+ 2024-08-15 04:17:02,115 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 6
136
+ 2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 7
137
+ 2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: status_report
138
+ 2024-08-15 04:17:02,116 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
139
+ 2024-08-15 04:17:02,116 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 7
140
+ 2024-08-15 04:17:02,116 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
141
+ 2024-08-15 04:17:02,116 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 7
142
+ 2024-08-15 04:17:02,145 INFO Thread-12 :12253 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_041534-1ld4rgmy/files/wandb-summary.json
143
+ 2024-08-15 04:17:03,068 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
144
+ 2024-08-15 04:17:03,854 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 8
145
+ 2024-08-15 04:17:03,854 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: poll_exit
146
+ 2024-08-15 04:17:03,854 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
147
+ 2024-08-15 04:17:03,855 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 8
148
+ 2024-08-15 04:17:03,855 DEBUG SenderThread:12253 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-15 04:17:03,855 INFO SenderThread:12253 [sender.py:send_request_defer():613] handle sender defer: 8
150
+ 2024-08-15 04:17:03,855 INFO SenderThread:12253 [job_builder.py:build():296] Attempting to build job artifact
151
+ 2024-08-15 04:17:03,856 INFO SenderThread:12253 [job_builder.py:_get_source_type():426] is repo sourced job
152
+ 2024-08-15 04:17:03,871 INFO SenderThread:12253 [job_builder.py:build():402] adding wandb-job metadata file
153
+ 2024-08-15 04:17:03,880 INFO SenderThread:12253 [sender.py:transition_state():617] send defer: 9
154
+ 2024-08-15 04:17:03,880 DEBUG SenderThread:12253 [sender.py:send():382] send: artifact
155
+ 2024-08-15 04:17:03,880 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: defer
156
+ 2024-08-15 04:17:03,881 INFO HandlerThread:12253 [handler.py:handle_request_defer():172] handle defer: 9
157
+ 2024-08-15 04:17:04,069 DEBUG HandlerThread:12253 [handler.py:handle_request():146] handle_request: poll_exit
158
+ 2024-08-15 04:17:04,146 INFO Thread-12 :12253 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_041534-1ld4rgmy/files/output.log
159
+ 2024-08-15 04:17:35,760 WARNING StreamThr :12253 [internal.py:is_dead():414] Internal process exiting, parent pid 12182 disappeared
160
+ 2024-08-15 04:17:35,760 ERROR StreamThr :12253 [internal.py:wandb_internal():152] Internal process shutdown.
161
+ 2024-08-15 04:17:36,070 INFO WriterThread:12253 [datastore.py:close():296] close: /project/wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb
162
+ 2024-08-15 04:17:36,071 INFO HandlerThread:12253 [handler.py:finish():869] shutting down handler
wandb/run-20240815_041534-1ld4rgmy/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-15 04:15:34,639 INFO MainThread:12182 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Configure stats pid to 12182
3
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug.log
9
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_041534-1ld4rgmy/logs/debug-internal.log
10
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-04:15:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():616] starting backend
14
+ 2024-08-15 04:15:34,640 INFO MainThread:12182 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-15 04:15:34,645 INFO MainThread:12182 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-15 04:15:34,646 INFO MainThread:12182 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-15 04:15:34,651 INFO MainThread:12182 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-15 04:15:34,662 INFO MainThread:12182 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-15 04:15:35,083 INFO MainThread:12182 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-15 04:15:35,107 INFO MainThread:12182 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-15 04:15:35,205 INFO MainThread:12182 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-15 04:15:35,206 INFO MainThread:12182 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-15 04:15:35,207 INFO MainThread:12182 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-15 04:15:41,877 INFO MainThread:12182 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240815_041534-1ld4rgmy/run-1ld4rgmy.wandb ADDED
Binary file (18 kB).
 
wandb/run-20240824_202022-z2bjbf6e/files/config.yaml ADDED
@@ -0,0 +1,321 @@
+ wandb_version: 1
+
+ sharding_strategy:
+ desc: null
+ value: NO_SHARD
+ checkpoint_type:
+ desc: null
+ value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+ desc: null
+ value: true
+ fsdp_cpu_offload:
+ desc: null
+ value: false
+ low_cpu_fsdp:
+ desc: null
+ value: false
+ no_meta_device:
+ desc: null
+ value: false
+ data_path:
+ desc: null
+ value: null
+ split:
+ desc: null
+ value: 969, 30, 1
+ train_data_path:
+ desc: null
+ value:
+ - '1754785366'
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+ - '28623823675'
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
+ valid_data_path:
+ desc: null
+ value:
+ - '1205770'
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
+ test_data_path:
+ desc: null
+ value:
+ - '1205770'
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
+ data_cache_path:
+ desc: null
+ value: null
+ vocab_size:
+ desc: null
+ value: null
+ vocab_file:
+ desc: null
+ value: null
+ merge_file:
+ desc: null
+ value: null
+ seq_length:
+ desc: null
+ value: 1024
+ num_workers:
+ desc: null
+ value: 4
+ tokenizer_type:
+ desc: null
+ value: HFPreTrainedTokenizer
+ tokenizer_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ reset_position_ids:
+ desc: null
+ value: false
+ reset_attention_mask:
+ desc: null
+ value: false
+ eod_mask_loss:
+ desc: null
+ value: false
+ retro_return_doc_ids:
+ desc: null
+ value: false
+ short_seq_prob:
+ desc: null
+ value: 0.1
+ vocab_extra_ids:
+ desc: null
+ value: 0
+ seed:
+ desc: null
+ value: 1234
+ use_mpi:
+ desc: null
+ value: false
+ wandb_entity:
+ desc: null
+ value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+ desc: null
+ value: yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07
+ wandb_project:
+ desc: null
+ value: yans_experiment
+ quantization:
+ desc: null
+ value: false
+ use_freeze_layers:
+ desc: null
+ value: false
+ freeze_layers:
+ desc: null
+ value: null
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ mixed_precision:
+ desc: null
+ value: true
+ param_dtype:
+ desc: null
+ value: null
+ load:
+ desc: null
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
+ save:
+ desc: null
+ value: /work/llm_recipes/models/yans-baseline-qwen2-0.5B
+ base_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ use_better_transformer:
+ desc: null
+ value: false
+ grad_clip_norm:
+ desc: null
+ value: 1.0
+ eval_interval:
+ desc: null
+ value: 200
+ save_interval:
+ desc: null
+ value: 200
+ eval_iters:
+ desc: null
+ value: 10
+ optimizer:
+ desc: null
+ value: anyprecision
+ lr:
+ desc: null
+ value: 3.5e-06
+ lr_decay_style:
+ desc: null
+ value: cosine
+ lr_decay_iters:
+ desc: null
+ value: 23178
+ lr_warmup_iters:
+ desc: null
+ value: 500
+ min_lr:
+ desc: null
+ value: 3.5e-07
+ train_iters:
+ desc: null
+ value: 23178
+ train_samples:
+ desc: null
+ value: null
+ global_batch_size:
+ desc: null
+ value: 1280
+ micro_batch_size:
+ desc: null
+ value: 16
+ make_vocab_size_divisible_by:
+ desc: null
+ value: 128
+ sliding_window_size:
+ desc: null
+ value: 131072
+ skip_batch:
+ desc: null
+ value: null
+ no_save_optimizer_state:
+ desc: null
+ value: false
+ continual_pretraining:
+ desc: null
+ value: false
+ instruction_tuning:
+ desc: null
+ value: false
+ direct_preference_optimization:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.1
+ hidden_dropout:
+ desc: null
+ value: 0.1
+ weight_decay:
+ desc: null
+ value: 0.1
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.95
+ adam_eps:
+ desc: null
+ value: 1.0e-08
+ hf_transformer_model_dir:
+ desc: null
+ value: null
+ instruction_train_data_path:
+ desc: null
+ value: null
+ instruction_valid_data_path:
+ desc: null
+ value: null
+ epoch:
+ desc: null
+ value: null
+ instruction_dataset_size:
+ desc: null
+ value: null
+ save_sampler_state:
+ desc: null
+ value: false
+ label_smoothing:
+ desc: null
+ value: 0.0
+ save_n_checkpoints:
+ desc: null
+ value: 10
+ hf_repo_id:
+ desc: null
+ value: koichi12/yans-baseline-qwen2-0.5B
+ create_public_hf_repo:
+ desc: null
+ value: false
+ upload_all_checkpoints_to_hf:
+ desc: null
+ value: true
+ hf_upload_retry_limit:
+ desc: null
+ value: 2
+ exit_duration_in_mins:
+ desc: null
+ value: null
+ source_key:
+ desc: null
+ value: null
+ target_key:
+ desc: null
+ value: null
+ attn_implementation:
+ desc: null
+ value: flash_attention_2
+ efficient_instruction_tuning:
+ desc: null
+ value: false
+ remove_padding_masking:
+ desc: null
+ value: false
+ save_start_iter:
+ desc: null
+ value: null
+ valid_micro_batch_size:
+ desc: null
+ value: 1
+ rank:
+ desc: null
+ value: 0
+ world_size:
+ desc: null
+ value: 8
+ padded_vocab_size:
+ desc: null
+ value: 151680
+ gradient_accumulation_steps:
+ desc: null
+ value: 10
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.43.3
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1724498422.652614
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ - 105
+ 2:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ - 105
+ 3:
+ - 13
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 6: 4.43.3
+ 8:
+ - 5
+ 13: linux-x86_64
wandb/run-20240824_202022-z2bjbf6e/files/output.log ADDED
@@ -0,0 +1,51 @@
+ Created Hugging Face repository with ID koichi12/yans-baseline-qwen2-0.5B.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ File not found: /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/yans-baseline-qwen2-0.5B/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/yans-baseline-qwen2-0.5B, skipping model loading
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ Let split = None
+ Unable to save the indexes because path_to_cache is None
+ Traceback (most recent call last):
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 270, in build_generic_dataset
+ dataset = cls(*args)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 359, in __init__
+ self.initialize(path_prefix, multimodal)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 374, in initialize
+ self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal)
+ File "/project/megatron_lm/megatron/core/datasets/indexed_dataset.py", line 233, in __init__
+ with open(idx_path, "rb") as stream:
+ FileNotFoundError: [Errno 2] No such file or directory: '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document.idx'
+ The above exception was the direct cause of the following exception:
+ Traceback (most recent call last):
+ File "/project/examples/finetuning.py", line 13, in <module>
+ main()
+ File "/project/src/llama_recipes/finetuning.py", line 162, in main
+ train_dataset, validation_dataset, test_dataset = build_train_valid_test_datasets()
+ File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 76, in build_train_valid_test_datasets
+ return train_valid_test_datasets_provider(train_val_test_num_samples)
+ File "/project/src/llama_recipes/datasets/pretrain_dataset.py", line 46, in train_valid_test_datasets_provider
+ ).build()
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 56, in build
+ return self._build_blended_dataset_splits()
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 162, in _build_blended_dataset_splits
+ self._build_megatron_dataset_splits(
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 199, in _build_megatron_dataset_splits
+ indexed_dataset = self.build_generic_dataset(
+ File "/project/megatron_lm/megatron/core/datasets/blended_megatron_dataset_builder.py", line 278, in build_generic_dataset
+ raise Exception(log) from err
+ Exception: Failed to write dataset materials to the data cache directory. Please supply a directory to which you have write access via the path_to_cache attribute in BlendedMegatronDatasetConfig and retry. Refer to the preserved traceback above for more information.
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+ train: 29667840
+ validation: 1484800
+ test: 12800
+ > building train, validation, and test datasets for GPT ...
wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json ADDED
@@ -0,0 +1,880 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-24T11:20:23.248321",
5
+ "startedAt": "2024-08-24T11:20:22.637930",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1024",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "16",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "1280",
19
+ "--train-iters",
20
+ "23178",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1205770",
32
+ "/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
33
+ "--test-data-path",
34
+ "1205770",
35
+ "/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document",
36
+ "--lr",
37
+ "3.5e-6",
38
+ "--min-lr",
39
+ "3.5e-7",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "23178",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-8",
58
+ "--save-interval",
59
+ "200",
60
+ "--eval-interval",
61
+ "200",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
70
+ "--load",
71
+ "/work/llm_recipes/models/yans-baseline-qwen2-0.5B",
72
+ "--num-workers",
73
+ "4",
74
+ "--fsdp-activation-checkpointing",
75
+ "--sharding-strategy",
76
+ "NO_SHARD",
77
+ "--checkpoint-type",
78
+ "LOCAL_STATE_DICT",
79
+ "--save-n-checkpoints",
80
+ "10",
81
+ "--upload-all-checkpoints-to-hf",
82
+ "--hf-upload-retry-limit",
83
+ "2",
84
+ "--hf-repo-id",
85
+ "koichi12/yans-baseline-qwen2-0.5B",
86
+ "--wandb-entity",
87
+ "iwakawa-koichi-q5-tohoku-nlp6723",
88
+ "--wandb-project",
89
+ "yans_experiment",
90
+ "--wandb-name",
91
+ "yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07"
92
+ ],
93
+ "state": "running",
94
+ "program": "/project/examples/finetuning.py",
95
+ "codePathLocal": "examples/finetuning.py",
96
+ "codePath": "examples/finetuning.py",
97
+ "git": {
98
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
99
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
100
+ },
101
+ "email": null,
102
+ "root": "/project",
103
+ "host": "gpu-koiwa-00",
104
+ "username": "koiwa",
105
+ "executable": "/usr/bin/python",
106
+ "cpu_count": 144,
107
+ "cpu_count_logical": 144,
108
+ "cpu_freq": {
109
+ "current": 2400.0340000000015,
110
+ "min": 0.0,
111
+ "max": 0.0
112
+ },
113
+ "cpu_freq_per_core": [
114
+ {
115
+ "current": 2400.034,
116
+ "min": 0.0,
117
+ "max": 0.0
118
+ },
119
+ {
120
+ "current": 2400.034,
121
+ "min": 0.0,
122
+ "max": 0.0
123
+ },
124
+ {
125
+ "current": 2400.034,
126
+ "min": 0.0,
127
+ "max": 0.0
128
+ },
129
+ {
130
+ "current": 2400.034,
131
+ "min": 0.0,
132
+ "max": 0.0
133
+ },
134
+ {
135
+ "current": 2400.034,
136
+ "min": 0.0,
137
+ "max": 0.0
138
+ },
139
+ {
140
+ "current": 2400.034,
141
+ "min": 0.0,
142
+ "max": 0.0
143
+ },
144
+ {
145
+ "current": 2400.034,
146
+ "min": 0.0,
147
+ "max": 0.0
148
+ },
149
+ {
150
+ "current": 2400.034,
151
+ "min": 0.0,
152
+ "max": 0.0
153
+ },
154
+ {
155
+ "current": 2400.034,
156
+ "min": 0.0,
157
+ "max": 0.0
158
+ },
159
+ {
160
+ "current": 2400.034,
161
+ "min": 0.0,
162
+ "max": 0.0
163
+ },
164
+ {
165
+ "current": 2400.034,
166
+ "min": 0.0,
167
+ "max": 0.0
168
+ },
169
+ {
170
+ "current": 2400.034,
171
+ "min": 0.0,
172
+ "max": 0.0
173
+ },
174
+ {
175
+ "current": 2400.034,
176
+ "min": 0.0,
177
+ "max": 0.0
178
+ },
179
+ {
180
+ "current": 2400.034,
181
+ "min": 0.0,
182
+ "max": 0.0
183
+ },
184
+ {
185
+ "current": 2400.034,
186
+ "min": 0.0,
187
+ "max": 0.0
188
+ },
189
+ {
190
+ "current": 2400.034,
191
+ "min": 0.0,
192
+ "max": 0.0
193
+ },
194
+ {
195
+ "current": 2400.034,
196
+ "min": 0.0,
197
+ "max": 0.0
198
+ },
199
+ {
200
+ "current": 2400.034,
201
+ "min": 0.0,
202
+ "max": 0.0
203
+ },
204
+ {
205
+ "current": 2400.034,
206
+ "min": 0.0,
207
+ "max": 0.0
208
+ },
209
+ {
210
+ "current": 2400.034,
211
+ "min": 0.0,
212
+ "max": 0.0
213
+ },
214
+ {
215
+ "current": 2400.034,
216
+ "min": 0.0,
217
+ "max": 0.0
218
+ },
219
+ {
220
+ "current": 2400.034,
221
+ "min": 0.0,
222
+ "max": 0.0
223
+ },
224
+ {
225
+ "current": 2400.034,
226
+ "min": 0.0,
227
+ "max": 0.0
228
+ },
229
+ {
230
+ "current": 2400.034,
231
+ "min": 0.0,
232
+ "max": 0.0
233
+ },
234
+ {
235
+ "current": 2400.034,
236
+ "min": 0.0,
237
+ "max": 0.0
238
+ },
239
+ {
240
+ "current": 2400.034,
241
+ "min": 0.0,
242
+ "max": 0.0
243
+ },
244
+ {
245
+ "current": 2400.034,
246
+ "min": 0.0,
247
+ "max": 0.0
248
+ },
249
+ {
250
+ "current": 2400.034,
251
+ "min": 0.0,
252
+ "max": 0.0
253
+ },
254
+ {
255
+ "current": 2400.034,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ {
260
+ "current": 2400.034,
261
+ "min": 0.0,
262
+ "max": 0.0
263
+ },
264
+ {
265
+ "current": 2400.034,
266
+ "min": 0.0,
267
+ "max": 0.0
268
+ },
269
+ {
270
+ "current": 2400.034,
271
+ "min": 0.0,
272
+ "max": 0.0
273
+ },
274
+ {
275
+ "current": 2400.034,
276
+ "min": 0.0,
277
+ "max": 0.0
278
+ },
279
+ {
280
+ "current": 2400.034,
281
+ "min": 0.0,
282
+ "max": 0.0
283
+ },
284
+ {
285
+ "current": 2400.034,
286
+ "min": 0.0,
287
+ "max": 0.0
288
+ },
289
+ {
290
+ "current": 2400.034,
291
+ "min": 0.0,
292
+ "max": 0.0
293
+ },
294
+ {
295
+ "current": 2400.034,
296
+ "min": 0.0,
297
+ "max": 0.0
298
+ },
299
+ {
300
+ "current": 2400.034,
301
+ "min": 0.0,
302
+ "max": 0.0
303
+ },
304
+ {
305
+ "current": 2400.034,
306
+ "min": 0.0,
307
+ "max": 0.0
308
+ },
309
+ {
310
+ "current": 2400.034,
311
+ "min": 0.0,
312
+ "max": 0.0
313
+ },
314
+ {
315
+ "current": 2400.034,
316
+ "min": 0.0,
317
+ "max": 0.0
318
+ },
319
+ {
320
+ "current": 2400.034,
321
+ "min": 0.0,
322
+ "max": 0.0
323
+ },
324
+ {
325
+ "current": 2400.034,
326
+ "min": 0.0,
327
+ "max": 0.0
328
+ },
329
+ {
330
+ "current": 2400.034,
331
+ "min": 0.0,
332
+ "max": 0.0
333
+ },
334
+ {
335
+ "current": 2400.034,
336
+ "min": 0.0,
337
+ "max": 0.0
338
+ },
339
+ {
340
+ "current": 2400.034,
341
+ "min": 0.0,
342
+ "max": 0.0
343
+ },
344
+ {
345
+ "current": 2400.034,
346
+ "min": 0.0,
347
+ "max": 0.0
348
+ },
349
+ {
350
+ "current": 2400.034,
351
+ "min": 0.0,
352
+ "max": 0.0
353
+ },
354
+ {
355
+ "current": 2400.034,
356
+ "min": 0.0,
357
+ "max": 0.0
358
+ },
359
+ {
360
+ "current": 2400.034,
361
+ "min": 0.0,
362
+ "max": 0.0
363
+ },
364
+ {
365
+ "current": 2400.034,
366
+ "min": 0.0,
367
+ "max": 0.0
368
+ },
369
+ {
370
+ "current": 2400.034,
371
+ "min": 0.0,
372
+ "max": 0.0
373
+ },
374
+ {
375
+ "current": 2400.034,
376
+ "min": 0.0,
377
+ "max": 0.0
378
+ },
379
+ {
380
+ "current": 2400.034,
381
+ "min": 0.0,
382
+ "max": 0.0
383
+ },
384
+ {
385
+ "current": 2400.034,
386
+ "min": 0.0,
387
+ "max": 0.0
388
+ },
389
+ {
390
+ "current": 2400.034,
391
+ "min": 0.0,
392
+ "max": 0.0
393
+ },
394
+ {
395
+ "current": 2400.034,
396
+ "min": 0.0,
397
+ "max": 0.0
398
+ },
399
+ {
400
+ "current": 2400.034,
401
+ "min": 0.0,
402
+ "max": 0.0
403
+ },
404
+ {
405
+ "current": 2400.034,
406
+ "min": 0.0,
407
+ "max": 0.0
408
+ },
409
+ {
410
+ "current": 2400.034,
411
+ "min": 0.0,
412
+ "max": 0.0
413
+ },
414
+ {
415
+ "current": 2400.034,
416
+ "min": 0.0,
417
+ "max": 0.0
418
+ },
419
+ {
420
+ "current": 2400.034,
421
+ "min": 0.0,
422
+ "max": 0.0
423
+ },
424
+ {
425
+ "current": 2400.034,
426
+ "min": 0.0,
427
+ "max": 0.0
428
+ },
429
+ {
430
+ "current": 2400.034,
431
+ "min": 0.0,
432
+ "max": 0.0
433
+ },
434
+ {
435
+ "current": 2400.034,
436
+ "min": 0.0,
437
+ "max": 0.0
438
+ },
439
+ {
440
+ "current": 2400.034,
441
+ "min": 0.0,
442
+ "max": 0.0
443
+ },
444
+ {
445
+ "current": 2400.034,
446
+ "min": 0.0,
447
+ "max": 0.0
448
+ },
449
+ {
450
+ "current": 2400.034,
451
+ "min": 0.0,
452
+ "max": 0.0
453
+ },
454
+ {
455
+ "current": 2400.034,
456
+ "min": 0.0,
457
+ "max": 0.0
458
+ },
459
+ {
460
+ "current": 2400.034,
461
+ "min": 0.0,
462
+ "max": 0.0
463
+ },
464
+ {
465
+ "current": 2400.034,
466
+ "min": 0.0,
467
+ "max": 0.0
468
+ },
469
+ {
470
+ "current": 2400.034,
471
+ "min": 0.0,
472
+ "max": 0.0
473
+ },
474
+ {
475
+ "current": 2400.034,
476
+ "min": 0.0,
477
+ "max": 0.0
478
+ },
479
+ {
480
+ "current": 2400.034,
481
+ "min": 0.0,
482
+ "max": 0.0
483
+ },
484
+ {
485
+ "current": 2400.034,
486
+ "min": 0.0,
487
+ "max": 0.0
488
+ },
489
+ {
490
+ "current": 2400.034,
491
+ "min": 0.0,
492
+ "max": 0.0
493
+ },
494
+ {
495
+ "current": 2400.034,
496
+ "min": 0.0,
497
+ "max": 0.0
498
+ },
499
+ {
500
+ "current": 2400.034,
501
+ "min": 0.0,
502
+ "max": 0.0
503
+ },
504
+ {
505
+ "current": 2400.034,
506
+ "min": 0.0,
507
+ "max": 0.0
508
+ },
509
+ {
510
+ "current": 2400.034,
511
+ "min": 0.0,
512
+ "max": 0.0
513
+ },
514
+ {
515
+ "current": 2400.034,
516
+ "min": 0.0,
517
+ "max": 0.0
518
+ },
519
+ {
520
+ "current": 2400.034,
521
+ "min": 0.0,
522
+ "max": 0.0
523
+ },
524
+ {
525
+ "current": 2400.034,
526
+ "min": 0.0,
527
+ "max": 0.0
528
+ },
529
+ {
530
+ "current": 2400.034,
531
+ "min": 0.0,
532
+ "max": 0.0
533
+ },
534
+ {
535
+ "current": 2400.034,
536
+ "min": 0.0,
537
+ "max": 0.0
538
+ },
539
+ {
540
+ "current": 2400.034,
541
+ "min": 0.0,
542
+ "max": 0.0
543
+ },
544
+ {
545
+ "current": 2400.034,
546
+ "min": 0.0,
547
+ "max": 0.0
548
+ },
549
+ {
550
+ "current": 2400.034,
551
+ "min": 0.0,
552
+ "max": 0.0
553
+ },
554
+ {
555
+ "current": 2400.034,
556
+ "min": 0.0,
557
+ "max": 0.0
558
+ },
559
+ {
560
+ "current": 2400.034,
561
+ "min": 0.0,
562
+ "max": 0.0
563
+ },
564
+ {
565
+ "current": 2400.034,
566
+ "min": 0.0,
567
+ "max": 0.0
568
+ },
569
+ {
570
+ "current": 2400.034,
571
+ "min": 0.0,
572
+ "max": 0.0
573
+ },
574
+ {
575
+ "current": 2400.034,
576
+ "min": 0.0,
577
+ "max": 0.0
578
+ },
579
+ {
580
+ "current": 2400.034,
581
+ "min": 0.0,
582
+ "max": 0.0
583
+ },
584
+ {
585
+ "current": 2400.034,
586
+ "min": 0.0,
587
+ "max": 0.0
588
+ },
589
+ {
590
+ "current": 2400.034,
591
+ "min": 0.0,
592
+ "max": 0.0
593
+ },
594
+ {
595
+ "current": 2400.034,
596
+ "min": 0.0,
597
+ "max": 0.0
598
+ },
599
+ {
600
+ "current": 2400.034,
601
+ "min": 0.0,
602
+ "max": 0.0
603
+ },
604
+ {
605
+ "current": 2400.034,
606
+ "min": 0.0,
607
+ "max": 0.0
608
+ },
609
+ {
610
+ "current": 2400.034,
611
+ "min": 0.0,
612
+ "max": 0.0
613
+ },
614
+ {
615
+ "current": 2400.034,
616
+ "min": 0.0,
617
+ "max": 0.0
618
+ },
619
+ {
620
+ "current": 2400.034,
621
+ "min": 0.0,
622
+ "max": 0.0
623
+ },
624
+ {
625
+ "current": 2400.034,
626
+ "min": 0.0,
627
+ "max": 0.0
628
+ },
629
+ {
630
+ "current": 2400.034,
631
+ "min": 0.0,
632
+ "max": 0.0
633
+ },
634
+ {
635
+ "current": 2400.034,
636
+ "min": 0.0,
637
+ "max": 0.0
638
+ },
639
+ {
640
+ "current": 2400.034,
641
+ "min": 0.0,
642
+ "max": 0.0
643
+ },
644
+ {
645
+ "current": 2400.034,
646
+ "min": 0.0,
647
+ "max": 0.0
648
+ },
649
+ {
650
+ "current": 2400.034,
651
+ "min": 0.0,
652
+ "max": 0.0
653
+ },
654
+ {
655
+ "current": 2400.034,
656
+ "min": 0.0,
657
+ "max": 0.0
658
+ },
659
+ {
660
+ "current": 2400.034,
661
+ "min": 0.0,
662
+ "max": 0.0
663
+ },
664
+ {
665
+ "current": 2400.034,
666
+ "min": 0.0,
667
+ "max": 0.0
668
+ },
669
+ {
670
+ "current": 2400.034,
671
+ "min": 0.0,
672
+ "max": 0.0
673
+ },
674
+ {
675
+ "current": 2400.034,
676
+ "min": 0.0,
677
+ "max": 0.0
678
+ },
679
+ {
680
+ "current": 2400.034,
681
+ "min": 0.0,
682
+ "max": 0.0
683
+ },
684
+ {
685
+ "current": 2400.034,
686
+ "min": 0.0,
687
+ "max": 0.0
688
+ },
689
+ {
690
+ "current": 2400.034,
691
+ "min": 0.0,
692
+ "max": 0.0
693
+ },
694
+ {
695
+ "current": 2400.034,
696
+ "min": 0.0,
697
+ "max": 0.0
698
+ },
699
+ {
700
+ "current": 2400.034,
701
+ "min": 0.0,
702
+ "max": 0.0
703
+ },
704
+ {
705
+ "current": 2400.034,
706
+ "min": 0.0,
707
+ "max": 0.0
708
+ },
709
+ {
710
+ "current": 2400.034,
711
+ "min": 0.0,
712
+ "max": 0.0
713
+ },
714
+ {
715
+ "current": 2400.034,
716
+ "min": 0.0,
717
+ "max": 0.0
718
+ },
719
+ {
720
+ "current": 2400.034,
721
+ "min": 0.0,
722
+ "max": 0.0
723
+ },
724
+ {
725
+ "current": 2400.034,
726
+ "min": 0.0,
727
+ "max": 0.0
728
+ },
729
+ {
730
+ "current": 2400.034,
731
+ "min": 0.0,
732
+ "max": 0.0
733
+ },
734
+ {
735
+ "current": 2400.034,
736
+ "min": 0.0,
737
+ "max": 0.0
738
+ },
739
+ {
740
+ "current": 2400.034,
741
+ "min": 0.0,
742
+ "max": 0.0
743
+ },
744
+ {
745
+ "current": 2400.034,
746
+ "min": 0.0,
747
+ "max": 0.0
748
+ },
749
+ {
750
+ "current": 2400.034,
751
+ "min": 0.0,
752
+ "max": 0.0
753
+ },
754
+ {
755
+ "current": 2400.034,
756
+ "min": 0.0,
757
+ "max": 0.0
758
+ },
759
+ {
760
+ "current": 2400.034,
761
+ "min": 0.0,
762
+ "max": 0.0
763
+ },
764
+ {
765
+ "current": 2400.034,
766
+ "min": 0.0,
767
+ "max": 0.0
768
+ },
769
+ {
770
+ "current": 2400.034,
771
+ "min": 0.0,
772
+ "max": 0.0
773
+ },
774
+ {
775
+ "current": 2400.034,
776
+ "min": 0.0,
777
+ "max": 0.0
778
+ },
779
+ {
780
+ "current": 2400.034,
781
+ "min": 0.0,
782
+ "max": 0.0
783
+ },
784
+ {
785
+ "current": 2400.034,
786
+ "min": 0.0,
787
+ "max": 0.0
788
+ },
789
+ {
790
+ "current": 2400.034,
791
+ "min": 0.0,
792
+ "max": 0.0
793
+ },
794
+ {
795
+ "current": 2400.034,
796
+ "min": 0.0,
797
+ "max": 0.0
798
+ },
799
+ {
800
+ "current": 2400.034,
801
+ "min": 0.0,
802
+ "max": 0.0
803
+ },
804
+ {
805
+ "current": 2400.034,
806
+ "min": 0.0,
807
+ "max": 0.0
808
+ },
809
+ {
810
+ "current": 2400.034,
811
+ "min": 0.0,
812
+ "max": 0.0
813
+ },
814
+ {
815
+ "current": 2400.034,
816
+ "min": 0.0,
817
+ "max": 0.0
818
+ },
819
+ {
820
+ "current": 2400.034,
821
+ "min": 0.0,
822
+ "max": 0.0
823
+ },
824
+ {
825
+ "current": 2400.034,
826
+ "min": 0.0,
827
+ "max": 0.0
828
+ },
829
+ {
830
+ "current": 2400.034,
831
+ "min": 0.0,
832
+ "max": 0.0
833
+ }
834
+ ],
835
+ "disk": {
836
+ "/": {
837
+ "total": 0.0625,
838
+ "used": 1.1444091796875e-05
839
+ }
840
+ },
841
+ "gpu": "NVIDIA A100-SXM4-40GB",
842
+ "gpu_count": 8,
843
+ "gpu_devices": [
844
+ {
845
+ "name": "NVIDIA A100-SXM4-40GB",
846
+ "memory_total": 42949672960
847
+ },
848
+ {
849
+ "name": "NVIDIA A100-SXM4-40GB",
850
+ "memory_total": 42949672960
851
+ },
852
+ {
853
+ "name": "NVIDIA A100-SXM4-40GB",
854
+ "memory_total": 42949672960
855
+ },
856
+ {
857
+ "name": "NVIDIA A100-SXM4-40GB",
858
+ "memory_total": 42949672960
859
+ },
860
+ {
861
+ "name": "NVIDIA A100-SXM4-40GB",
862
+ "memory_total": 42949672960
863
+ },
864
+ {
865
+ "name": "NVIDIA A100-SXM4-40GB",
866
+ "memory_total": 42949672960
867
+ },
868
+ {
869
+ "name": "NVIDIA A100-SXM4-40GB",
870
+ "memory_total": 42949672960
871
+ },
872
+ {
873
+ "name": "NVIDIA A100-SXM4-40GB",
874
+ "memory_total": 42949672960
875
+ }
876
+ ],
877
+ "memory": {
878
+ "total": 453.4449462890625
879
+ }
880
+ }
wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 4}}
wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log ADDED
@@ -0,0 +1,191 @@
1
+ 2024-08-24 20:20:22,655 INFO StreamThr :25836 [internal.py:wandb_internal():86] W&B internal server running at pid: 25836, started at: 2024-08-24 20:20:22.654049
2
+ 2024-08-24 20:20:22,656 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-24 20:20:22,659 INFO WriterThread:25836 [datastore.py:open_for_write():87] open: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
4
+ 2024-08-24 20:20:22,660 DEBUG SenderThread:25836 [sender.py:send():382] send: header
5
+ 2024-08-24 20:20:22,676 DEBUG SenderThread:25836 [sender.py:send():382] send: run
6
+ 2024-08-24 20:20:23,101 INFO SenderThread:25836 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240824_202022-z2bjbf6e/files
7
+ 2024-08-24 20:20:23,101 INFO SenderThread:25836 [sender.py:_start_run_threads():1136] run started: z2bjbf6e with start time 1724498422.652614
8
+ 2024-08-24 20:20:23,106 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-24 20:20:23,106 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-24 20:20:23,175 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():27] System info init
12
+ 2024-08-24 20:20:23,182 DEBUG HandlerThread:25836 [system_info.py:__init__():42] System info init done
13
+ 2024-08-24 20:20:23,182 INFO HandlerThread:25836 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-24 20:20:23,182 INFO SystemMonitor:25836 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-24 20:20:23,183 INFO HandlerThread:25836 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-24 20:20:23,183 INFO SystemMonitor:25836 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-24 20:20:23,184 INFO SystemMonitor:25836 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-24 20:20:23,185 INFO SystemMonitor:25836 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-24 20:20:23,186 INFO SystemMonitor:25836 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-24 20:20:23,248 DEBUG HandlerThread:25836 [system_info.py:probe():151] Probing system
22
+ 2024-08-24 20:20:23,250 DEBUG HandlerThread:25836 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_info.py:probe():199] Probing system done
25
+ 2024-08-24 20:20:23,264 DEBUG HandlerThread:25836 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-24T11:20:23.248321', 'startedAt': '2024-08-24T11:20:22.637930', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '131072', '--micro-batch-size', '16', '--valid_micro_batch_size', '1', '--global-batch-size', '1280', '--train-iters', '23178', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--test-data-path', '1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document', '--lr', '3.5e-6', '--min-lr', '3.5e-7', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '23178', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', '--num-workers', '4', '--fsdp-activation-checkpointing', '--sharding-strategy', 'NO_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-baseline-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'yans_experiment', '--wandb-name', 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 144, 'cpu_count_logical': 144, 'cpu_freq': {'current': 2400.0340000000015, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 
{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 
0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}, {'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 453.4449462890625}}
26
+ 2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-24 20:20:23,264 INFO HandlerThread:25836 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-24 20:20:23,266 INFO HandlerThread:25836 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-24 20:20:23,272 DEBUG SenderThread:25836 [sender.py:send():382] send: files
30
+ 2024-08-24 20:20:23,272 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-24 20:20:23,283 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-24 20:20:23,284 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-24 20:20:23,284 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-24 20:20:23,286 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-24 20:20:23,526 DEBUG SenderThread:25836 [sender.py:send():382] send: telemetry
37
+ 2024-08-24 20:20:23,973 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /tmp/tmpwjpjqs3pwandb/55szr5f9-wandb-metadata.json
38
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
39
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json
40
+ 2024-08-24 20:20:24,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
41
+ 2024-08-24 20:20:26,103 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
42
+ 2024-08-24 20:20:27,701 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-24 20:20:27,737 DEBUG SenderThread:25836 [sender.py:send():382] send: exit
44
+ 2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-24 20:20:27,737 INFO SenderThread:25836 [sender.py:send_exit():591] handling runtime: 4
46
+ 2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-24 20:20:27,739 INFO SenderThread:25836 [sender.py:send_exit():597] send defer
48
+ 2024-08-24 20:20:27,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-24 20:20:27,739 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-24 20:20:27,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-24 20:20:27,740 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-24 20:20:27,740 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-24 20:20:27,740 INFO HandlerThread:25836 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-24 20:20:27,740 DEBUG SystemMonitor:25836 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-24 20:20:27,741 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-24 20:20:27,741 DEBUG SystemMonitor:25836 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-24 20:20:28,105 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
67
+ 2024-08-24 20:20:28,106 INFO Thread-12 :25836 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
68
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined gpu monitor
69
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined memory monitor
70
+ 2024-08-24 20:20:28,918 INFO HandlerThread:25836 [interfaces.py:finish():202] Joined network monitor
71
+ 2024-08-24 20:20:28,918 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
72
+ 2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
73
+ 2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 2
74
+ 2024-08-24 20:20:28,920 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 3
75
+ 2024-08-24 20:20:28,920 DEBUG SenderThread:25836 [sender.py:send():382] send: stats
76
+ 2024-08-24 20:20:28,920 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
77
+ 2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
78
+ 2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-24 20:20:28,921 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-24 20:20:28,921 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-24 20:20:28,921 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-24 20:20:28,921 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-24 20:20:28,922 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-24 20:20:28,922 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-24 20:20:28,922 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-24 20:20:28,922 DEBUG SenderThread:25836 [sender.py:send():382] send: summary
90
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-24 20:20:28,923 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-24 20:20:28,923 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-24 20:20:28,923 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-24 20:20:28,924 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-24 20:20:28,924 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-24 20:20:28,924 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-24 20:20:28,927 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-24 20:20:29,107 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
100
+ 2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 7
101
+ 2024-08-24 20:20:29,126 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
102
+ 2024-08-24 20:20:29,126 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 7
103
+ 2024-08-24 20:20:29,126 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
104
+ 2024-08-24 20:20:29,126 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 7
105
+ 2024-08-24 20:20:29,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
106
+ 2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
107
+ 2024-08-24 20:20:30,108 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
108
+ 2024-08-24 20:20:31,391 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 8
109
+ 2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
110
+ 2024-08-24 20:20:31,392 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-08-24 20:20:31,392 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 8
112
+ 2024-08-24 20:20:31,392 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
113
+ 2024-08-24 20:20:31,392 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 8
114
+ 2024-08-24 20:20:31,392 INFO SenderThread:25836 [job_builder.py:build():296] Attempting to build job artifact
115
+ 2024-08-24 20:20:31,393 INFO SenderThread:25836 [job_builder.py:_get_source_type():426] is repo sourced job
116
+ 2024-08-24 20:20:31,408 INFO SenderThread:25836 [job_builder.py:build():402] adding wandb-job metadata file
117
+ 2024-08-24 20:20:31,417 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 9
118
+ 2024-08-24 20:20:31,418 DEBUG SenderThread:25836 [sender.py:send():382] send: artifact
119
+ 2024-08-24 20:20:31,418 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-24 20:20:31,419 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 9
121
+ 2024-08-24 20:20:31,738 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
122
+ 2024-08-24 20:20:32,109 INFO Thread-12 :25836 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
123
+ 2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjgwMzg3NA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MzU1Mzg0Mw==', 'versionIndex': 0}}}
124
+ 2024-08-24 20:20:34,782 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
125
+ 2024-08-24 20:20:34,782 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-24 20:20:34,782 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 9
127
+ 2024-08-24 20:20:34,783 INFO SenderThread:25836 [dir_watcher.py:finish():358] shutting down directory watcher
128
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240824_202022-z2bjbf6e/files
129
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt requirements.txt
130
+ 2024-08-24 20:20:35,110 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml config.yaml
131
+ 2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-metadata.json wandb-metadata.json
132
+ 2024-08-24 20:20:35,112 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json wandb-summary.json
133
+ 2024-08-24 20:20:35,113 INFO SenderThread:25836 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log output.log
134
+ 2024-08-24 20:20:35,115 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 10
135
+ 2024-08-24 20:20:35,115 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
136
+ 2024-08-24 20:20:35,116 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
137
+ 2024-08-24 20:20:35,117 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 10
138
+ 2024-08-24 20:20:35,117 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
139
+ 2024-08-24 20:20:35,117 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 10
140
+ 2024-08-24 20:20:35,117 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
141
+ 2024-08-24 20:20:35,574 INFO wandb-upload_1:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/config.yaml
142
+ 2024-08-24 20:20:35,574 INFO wandb-upload_0:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/requirements.txt
143
+ 2024-08-24 20:20:35,580 INFO wandb-upload_2:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/wandb-summary.json
144
+ 2024-08-24 20:20:35,588 INFO wandb-upload_3:25836 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240824_202022-z2bjbf6e/files/output.log
145
+ 2024-08-24 20:20:35,739 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
146
+ 2024-08-24 20:20:35,740 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
147
+ 2024-08-24 20:20:35,788 INFO Thread-11 (_thread_body):25836 [sender.py:transition_state():617] send defer: 11
148
+ 2024-08-24 20:20:35,788 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
149
+ 2024-08-24 20:20:35,788 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 11
150
+ 2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
151
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 11
152
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
153
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 12
154
+ 2024-08-24 20:20:35,789 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-08-24 20:20:35,789 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 12
156
+ 2024-08-24 20:20:35,789 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
157
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 12
158
+ 2024-08-24 20:20:35,789 INFO SenderThread:25836 [file_stream.py:finish():595] file stream finish called
159
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [file_stream.py:finish():599] file stream finish is done
160
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 13
161
+ 2024-08-24 20:20:36,056 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-24 20:20:36,056 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 13
163
+ 2024-08-24 20:20:36,056 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 13
165
+ 2024-08-24 20:20:36,056 INFO SenderThread:25836 [sender.py:transition_state():617] send defer: 14
166
+ 2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: defer
167
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: final
168
+ 2024-08-24 20:20:36,057 INFO HandlerThread:25836 [handler.py:handle_request_defer():172] handle defer: 14
169
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send():382] send: footer
170
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: defer
171
+ 2024-08-24 20:20:36,057 INFO SenderThread:25836 [sender.py:send_request_defer():613] handle sender defer: 14
172
+ 2024-08-24 20:20:36,057 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
173
+ 2024-08-24 20:20:36,057 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
174
+ 2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-24 20:20:36,058 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: server_info
176
+ 2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: poll_exit
177
+ 2024-08-24 20:20:36,058 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: server_info
178
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: get_summary
179
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: sampled_history
180
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-08-24 20:20:36,060 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: job_info
182
+ 2024-08-24 20:20:36,224 DEBUG SenderThread:25836 [sender.py:send_request():409] send_request: job_info
183
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3866] rendering history
184
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
185
+ 2024-08-24 20:20:36,224 INFO MainThread:25836 [wandb_run.py:_footer_sync_info():3825] logging synced files
186
+ 2024-08-24 20:20:36,225 DEBUG HandlerThread:25836 [handler.py:handle_request():146] handle_request: shutdown
187
+ 2024-08-24 20:20:36,225 INFO HandlerThread:25836 [handler.py:finish():869] shutting down handler
188
+ 2024-08-24 20:20:37,061 INFO WriterThread:25836 [datastore.py:close():296] close: /project/wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb
189
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [sender.py:finish():1572] shutting down sender
190
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:finish():172] shutting down file pusher
191
+ 2024-08-24 20:20:37,224 INFO SenderThread:25836 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240824_202022-z2bjbf6e/logs/debug.log ADDED
@@ -0,0 +1,28 @@
1
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Configure stats pid to 25210
3
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train baseline'}
6
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug.log
9
+ 2024-08-24 20:20:22,645 INFO MainThread:25210 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240824_202022-z2bjbf6e/logs/debug-internal.log
10
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'test_data_path': ['1205770', '/work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 4, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-baseline-qwen2-0.5B_train_2024-08-24-20:20:07', 'wandb_project': 'yans_experiment', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-baseline-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 3.5e-06, 'lr_decay_style': 'cosine', 'lr_decay_iters': 23178, 'lr_warmup_iters': 500, 'min_lr': 3.5e-07, 'train_iters': 23178, 'train_samples': None, 'global_batch_size': 1280, 'micro_batch_size': 16, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-baseline-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 8, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 10}
13
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():616] starting backend
14
+ 2024-08-24 20:20:22,646 INFO MainThread:25210 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-24 20:20:22,651 INFO MainThread:25210 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-24 20:20:22,652 INFO MainThread:25210 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-24 20:20:22,659 INFO MainThread:25210 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-24 20:20:22,672 INFO MainThread:25210 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-24 20:20:23,105 INFO MainThread:25210 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-24 20:20:23,127 INFO MainThread:25210 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-24 20:20:23,283 INFO MainThread:25210 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-24 20:20:23,284 INFO MainThread:25210 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-24 20:20:37,225 WARNING MsgRouterThr:25210 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240824_202022-z2bjbf6e/run-z2bjbf6e.wandb ADDED
Binary file (18.8 kB)
wandb/run-20240826_221726-7jzdp89j/files/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1205770'
38
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1205770'
43
+ - /work/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 1024
59
+ num_workers:
60
+ desc: null
61
+ value: 4
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-1.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: yans-baseline-qwen2-1.5B-3.5e-5_train_2024-08-26-22:17:00
98
+ wandb_project:
99
+ desc: null
100
+ value: yans_experiment
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/yans-baseline-qwen2-1.5B-3.5e-5
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-1.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 200
140
+ save_interval:
141
+ desc: null
142
+ value: 200
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 3.5e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 23178
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 3.5e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 23178
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 1280
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 16
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-08
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/yans-baseline-qwen2-1.5B-3.5e-5
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 8
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 10
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724678246.995911
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 1536
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 1024
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 12
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 28
wandb/run-20240826_221726-7jzdp89j/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240826_221726-7jzdp89j/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0