koichi12 committed on
Commit 04522a2 · verified · Parent: b38ed3f

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. wandb/run-20240802_180656-l8nnlt0c/files/config.yaml +335 -0
  2. wandb/run-20240802_180656-l8nnlt0c/files/output.log +0 -0
  3. wandb/run-20240802_180656-l8nnlt0c/files/requirements.txt +271 -0
  4. wandb/run-20240802_180656-l8nnlt0c/files/wandb-metadata.json +215 -0
  5. wandb/run-20240802_180656-l8nnlt0c/files/wandb-summary.json +1 -0
  6. wandb/run-20240802_180656-l8nnlt0c/logs/debug.log +30 -0
  7. wandb/run-20240804_021608-l90yeme3/files/config.yaml +335 -0
  8. wandb/run-20240804_021608-l90yeme3/files/output.log +0 -0
  9. wandb/run-20240804_021608-l90yeme3/files/requirements.txt +271 -0
  10. wandb/run-20240804_021608-l90yeme3/files/wandb-metadata.json +215 -0
  11. wandb/run-20240804_021608-l90yeme3/files/wandb-summary.json +1 -0
  12. wandb/run-20240804_021608-l90yeme3/logs/debug-internal.log +0 -0
  13. wandb/run-20240804_021608-l90yeme3/logs/debug.log +29 -0
  14. wandb/run-20240804_035906-457c7q3q/files/config.yaml +335 -0
  15. wandb/run-20240804_035906-457c7q3q/files/output.log +130 -0
  16. wandb/run-20240804_035906-457c7q3q/files/requirements.txt +271 -0
  17. wandb/run-20240804_035906-457c7q3q/files/wandb-metadata.json +215 -0
  18. wandb/run-20240804_035906-457c7q3q/files/wandb-summary.json +1 -0
  19. wandb/run-20240804_035906-457c7q3q/logs/debug-internal.log +186 -0
  20. wandb/run-20240804_035906-457c7q3q/logs/debug.log +29 -0
  21. wandb/run-20240804_035906-457c7q3q/run-457c7q3q.wandb +0 -0
  22. wandb/run-20240804_143449-7tyiihss/files/config.yaml +335 -0
  23. wandb/run-20240804_143449-7tyiihss/files/output.log +135 -0
  24. wandb/run-20240804_143449-7tyiihss/files/requirements.txt +271 -0
  25. wandb/run-20240804_143449-7tyiihss/files/wandb-metadata.json +215 -0
  26. wandb/run-20240804_143449-7tyiihss/files/wandb-summary.json +1 -0
  27. wandb/run-20240804_143449-7tyiihss/logs/debug-internal.log +186 -0
  28. wandb/run-20240804_143449-7tyiihss/logs/debug.log +30 -0
  29. wandb/run-20240804_143449-7tyiihss/run-7tyiihss.wandb +0 -0
  30. wandb/run-20240804_153511-5ba5jbt6/files/config.yaml +335 -0
  31. wandb/run-20240804_153511-5ba5jbt6/files/output.log +135 -0
  32. wandb/run-20240804_153511-5ba5jbt6/files/requirements.txt +271 -0
  33. wandb/run-20240804_153511-5ba5jbt6/files/wandb-metadata.json +215 -0
  34. wandb/run-20240804_153511-5ba5jbt6/files/wandb-summary.json +1 -0
  35. wandb/run-20240804_153511-5ba5jbt6/logs/debug-internal.log +188 -0
  36. wandb/run-20240804_153511-5ba5jbt6/logs/debug.log +30 -0
  37. wandb/run-20240804_153511-5ba5jbt6/run-5ba5jbt6.wandb +0 -0
  38. wandb/run-20240812_052446-qrv0d6sp/files/config.yaml +314 -0
  39. wandb/run-20240812_052446-qrv0d6sp/files/output.log +12 -0
  40. wandb/run-20240812_052446-qrv0d6sp/files/requirements.txt +271 -0
  41. wandb/run-20240812_052446-qrv0d6sp/files/wandb-metadata.json +215 -0
  42. wandb/run-20240812_052446-qrv0d6sp/files/wandb-summary.json +1 -0
  43. wandb/run-20240812_052446-qrv0d6sp/logs/debug-internal.log +185 -0
  44. wandb/run-20240812_052446-qrv0d6sp/logs/debug.log +28 -0
  45. wandb/run-20240812_052446-qrv0d6sp/run-qrv0d6sp.wandb +0 -0
  46. wandb/run-20240812_072401-esew3nhv/files/config.yaml +335 -0
  47. wandb/run-20240812_072401-esew3nhv/files/requirements.txt +271 -0
  48. wandb/run-20240812_072401-esew3nhv/files/wandb-metadata.json +215 -0
  49. wandb/run-20240812_072401-esew3nhv/logs/debug-internal.log +240 -0
  50. wandb/run-20240812_072401-esew3nhv/logs/debug.log +29 -0
wandb/run-20240802_180656-l8nnlt0c/files/config.yaml ADDED
@@ -0,0 +1,335 @@
+ wandb_version: 1
+
+ sharding_strategy:
+   desc: null
+   value: FULL_SHARD
+ checkpoint_type:
+   desc: null
+   value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+   desc: null
+   value: true
+ fsdp_cpu_offload:
+   desc: null
+   value: false
+ low_cpu_fsdp:
+   desc: null
+   value: false
+ no_meta_device:
+   desc: null
+   value: false
+ data_path:
+   desc: null
+   value: null
+ split:
+   desc: null
+   value: 969, 30, 1
+ train_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ valid_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ test_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ data_cache_path:
+   desc: null
+   value: null
+ vocab_size:
+   desc: null
+   value: null
+ vocab_file:
+   desc: null
+   value: null
+ merge_file:
+   desc: null
+   value: null
+ seq_length:
+   desc: null
+   value: 512
+ num_workers:
+   desc: null
+   value: 2
+ tokenizer_type:
+   desc: null
+   value: Llama2Tokenizer
+ tokenizer_model:
+   desc: null
+   value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
+ reset_position_ids:
+   desc: null
+   value: false
+ reset_attention_mask:
+   desc: null
+   value: false
+ eod_mask_loss:
+   desc: null
+   value: false
+ retro_return_doc_ids:
+   desc: null
+   value: false
+ short_seq_prob:
+   desc: null
+   value: 0.1
+ vocab_extra_ids:
+   desc: null
+   value: 0
+ seed:
+   desc: null
+   value: 1234
+ use_mpi:
+   desc: null
+   value: false
+ wandb_entity:
+   desc: null
+   value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+   desc: null
+   value: tiny-mistral-sample_train_2024-08-02-18:06:43
+ wandb_project:
+   desc: null
+   value: llm_tutorial
+ quantization:
+   desc: null
+   value: false
+ use_freeze_layers:
+   desc: null
+   value: false
+ freeze_layers:
+   desc: null
+   value: null
+ bf16:
+   desc: null
+   value: true
+ fp16:
+   desc: null
+   value: false
+ mixed_precision:
+   desc: null
+   value: true
+ param_dtype:
+   desc: null
+   value: null
+ load:
+   desc: null
+   value: /work/llm_recipes/models/tiny-mistral-sample
+ save:
+   desc: null
+   value: /work/llm_recipes/models/tiny-mistral-sample
+ base_model:
+   desc: null
+   value: /share/pretrained_lm/custom/tiny-mistral
+ use_better_transformer:
+   desc: null
+   value: false
+ grad_clip_norm:
+   desc: null
+   value: 1.0
+ eval_interval:
+   desc: null
+   value: 200
+ save_interval:
+   desc: null
+   value: 200
+ eval_iters:
+   desc: null
+   value: 10
+ optimizer:
+   desc: null
+   value: adam
+ lr:
+   desc: null
+   value: 2.0e-05
+ lr_decay_style:
+   desc: null
+   value: cosine
+ lr_decay_iters:
+   desc: null
+   value: 20000
+ lr_warmup_iters:
+   desc: null
+   value: 500
+ min_lr:
+   desc: null
+   value: 1.0e-06
+ train_iters:
+   desc: null
+   value: 20000
+ train_samples:
+   desc: null
+   value: null
+ global_batch_size:
+   desc: null
+   value: 320
+ micro_batch_size:
+   desc: null
+   value: 8
+ make_vocab_size_divisible_by:
+   desc: null
+   value: 128
+ sliding_window_size:
+   desc: null
+   value: 4096
+ skip_batch:
+   desc: null
+   value: null
+ no_save_optimizer_state:
+   desc: null
+   value: false
+ continual_pretraining:
+   desc: null
+   value: false
+ instruction_tuning:
+   desc: null
+   value: false
+ direct_preference_optimization:
+   desc: null
+   value: false
+ attention_dropout:
+   desc: null
+   value: 0.1
+ hidden_dropout:
+   desc: null
+   value: 0.1
+ weight_decay:
+   desc: null
+   value: 0.1
+ adam_beta1:
+   desc: null
+   value: 0.9
+ adam_beta2:
+   desc: null
+   value: 0.95
+ adam_eps:
+   desc: null
+   value: 1.0e-06
+ hf_transformer_model_dir:
+   desc: null
+   value: null
+ instruction_train_data_path:
+   desc: null
+   value: null
+ instruction_valid_data_path:
+   desc: null
+   value: null
+ epoch:
+   desc: null
+   value: null
+ instruction_dataset_size:
+   desc: null
+   value: null
+ save_sampler_state:
+   desc: null
+   value: false
+ label_smoothing:
+   desc: null
+   value: 0.0
+ save_n_checkpoints:
+   desc: null
+   value: 10
+ hf_repo_id:
+   desc: null
+   value: koichi12/tiny-mistral-sample
+ create_public_hf_repo:
+   desc: null
+   value: false
+ upload_all_checkpoints_to_hf:
+   desc: null
+   value: false
+ hf_upload_retry_limit:
+   desc: null
+   value: 2
+ exit_duration_in_mins:
+   desc: null
+   value: null
+ source_key:
+   desc: null
+   value: null
+ target_key:
+   desc: null
+   value: null
+ attn_implementation:
+   desc: null
+   value: flash_attention_2
+ efficient_instruction_tuning:
+   desc: null
+   value: false
+ remove_padding_masking:
+   desc: null
+   value: false
+ save_start_iter:
+   desc: null
+   value: null
+ rank:
+   desc: null
+   value: 0
+ world_size:
+   desc: null
+   value: 1
+ padded_vocab_size:
+   desc: null
+   value: 32768
+ gradient_accumulation_steps:
+   desc: null
+   value: 40
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.10.12
+     cli_version: 0.16.3
+     framework: huggingface
+     huggingface_version: 4.43.3
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1722589616.489856
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       2:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       3:
+       - 13
+       - 16
+       - 23
+       4: 3.10.12
+       5: 0.16.3
+       6: 4.43.3
+       8:
+       - 5
+       13: linux-x86_64
+ activation_function:
+   desc: null
+   value: silu
+ hidden_size:
+   desc: null
+   value: 256
+ model_type:
+   desc: null
+   value: mistral
+ max_position_embeddings:
+   desc: null
+   value: 512
+ num_attention_heads:
+   desc: null
+   value: 4
+ num_hidden_layers:
+   desc: null
+   value: 4
+ model_architecture:
+   desc: null
+   value: MistralForCausalLM
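Not part of the original run artifacts: a quick sanity check on how the batch-size fields in this config relate. Assuming the usual convention that the global batch is accumulated over micro-batches across all ranks, `gradient_accumulation_steps` should equal `global_batch_size / (micro_batch_size * world_size)`:

```python
# Values taken from the config.yaml above (world_size is 1 in this run).
global_batch_size = 320
micro_batch_size = 8
world_size = 1

# Assumed relation, not confirmed by the repo:
grad_accum_steps = global_batch_size // (micro_batch_size * world_size)
print(grad_accum_steps)  # 40, matching gradient_accumulation_steps in the config
```

The logged `gradient_accumulation_steps: 40` is consistent with this relation.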
wandb/run-20240802_180656-l8nnlt0c/files/output.log ADDED
The diff for this file is too large to render.
wandb/run-20240802_180656-l8nnlt0c/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240802_180656-l8nnlt0c/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+   "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+   "python": "3.10.12",
+   "heartbeatAt": "2024-08-02T09:06:57.070043",
+   "startedAt": "2024-08-02T09:06:56.476807",
+   "docker": null,
+   "cuda": null,
+   "args": [
+     "--seq-length",
+     "512",
+     "--sliding-window-size",
+     "4096",
+     "--micro-batch-size",
+     "8",
+     "--global-batch-size",
+     "320",
+     "--train-iters",
+     "20000",
+     "--tokenizer-type",
+     "Llama2Tokenizer",
+     "--tokenizer-model",
+     "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
+     "--train-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--valid-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--test-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--lr",
+     "2e-5",
+     "--min-lr",
+     "1e-6",
+     "--lr-decay-style",
+     "cosine",
+     "--lr-warmup-iters",
+     "500",
+     "--lr-decay-iters",
+     "20000",
+     "--weight-decay",
+     "0.1",
+     "--grad-clip-norm",
+     "1.0",
+     "--optimizer",
+     "adam",
+     "--adam-beta1",
+     "0.9",
+     "--adam-beta2",
+     "0.95",
+     "--adam-eps",
+     "1e-6",
+     "--save-interval",
+     "200",
+     "--eval-interval",
+     "200",
+     "--eval-iters",
+     "10",
+     "--bf16",
+     "--mixed-precision",
+     "--base-model",
+     "/share/pretrained_lm/custom/tiny-mistral",
+     "--save",
+     "/work/llm_recipes/models/tiny-mistral-sample",
+     "--load",
+     "/work/llm_recipes/models/tiny-mistral-sample",
+     "--fsdp-activation-checkpointing",
+     "--sharding-strategy",
+     "FULL_SHARD",
+     "--checkpoint-type",
+     "LOCAL_STATE_DICT",
+     "--save-n-checkpoints",
+     "10",
+     "--hf-upload-retry-limit",
+     "2",
+     "--hf-repo-id",
+     "koichi12/tiny-mistral-sample",
+     "--wandb-entity",
+     "iwakawa-koichi-q5-tohoku-nlp6723",
+     "--wandb-project",
+     "llm_tutorial",
+     "--wandb-name",
+     "tiny-mistral-sample_train_2024-08-02-18:06:43"
+   ],
+   "state": "running",
+   "program": "/project/examples/finetuning.py",
+   "codePathLocal": "examples/finetuning.py",
+   "codePath": "examples/finetuning.py",
+   "git": {
+     "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+     "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+   },
+   "email": null,
+   "root": "/project",
+   "host": "gpu-koiwa-00",
+   "username": "koiwa",
+   "executable": "/usr/bin/python",
+   "cpu_count": 18,
+   "cpu_count_logical": 18,
+   "cpu_freq": {
+     "current": 2400.0409999999997,
+     "min": 0.0,
+     "max": 0.0
+   },
+   "cpu_freq_per_core": [
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.041,
+       "min": 0.0,
+       "max": 0.0
+     }
+   ],
+   "disk": {
+     "/": {
+       "total": 0.0625,
+       "used": 1.1444091796875e-05
+     }
+   },
+   "gpu": "NVIDIA A100-SXM4-40GB",
+   "gpu_count": 1,
+   "gpu_devices": [
+     {
+       "name": "NVIDIA A100-SXM4-40GB",
+       "memory_total": 42949672960
+     }
+   ],
+   "memory": {
+     "total": 56.48782730102539
+   }
+ }
wandb/run-20240802_180656-l8nnlt0c/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"training/loss": 8.714340209960938, "training/perplexity": 6089.615424118352, "utils/batch_size": 8, "utils/global_batch_size": 320, "utils/seq_len": 513, "utils/gradient_accumulation_steps": 40, "utils/iteration": 20000, "optimizer/lr": 1e-06, "optimizer/variance_l2": 0.01379441768482063, "optimizer/variance_sqrt_l2": 1.002313441281401, "optimizer/momentum_l2": 0.9743922417897144, "optimizer/weight_l2": 101.93656115447489, "optimizer/variance_l1": 1.003814697265625, "optimizer/variance_sqrt_l1": 592.75, "optimizer/momentum_l1": 429.375, "optimizer/weight_l1": 333120.0, "optimizer/variance_abs_max": 0.0012969970703125, "optimizer/variance_sqrt_abs_max": 0.0361328125, "optimizer/momentum_abs_max": 0.035400390625, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 1.1439334890019381, "stats/tokens_per_sec": 143504.84672253692, "stats/tokens_per_sec_per_gpu": 143504.84672253692, "stats/tflops": 10.15887571314936, "_timestamp": 1722611185.6210454, "_runtime": 21569.131189346313, "_step": 20000, "evaluation/val_loss": 8.702303886413574, "evaluation/val_ppl": 6016.75830078125, "_wandb": {"runtime": 21568}}
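Not part of the original run artifacts: the summary metrics above are internally consistent under the usual definitions, which can be checked in a few lines. Assuming `training/perplexity = exp(training/loss)` and `stats/tokens_per_sec = seq_len * global_batch_size / iteration_time` (standard conventions, not confirmed by the repo):

```python
import math

# Values copied from wandb-summary.json above.
loss = 8.714340209960938
seq_len = 513
global_batch_size = 320
iteration_time = 1.1439334890019381  # stats/1_iteration_time

perplexity = math.exp(loss)                              # training/perplexity
tokens_per_sec = seq_len * global_batch_size / iteration_time  # stats/tokens_per_sec

print(perplexity)      # close to the logged 6089.615424118352
print(tokens_per_sec)  # close to the logged 143504.84672253692
```

Both derived values reproduce the logged metrics, which also confirms that throughput was computed per optimizer step over the full global batch.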
wandb/run-20240802_180656-l8nnlt0c/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-02 18:06:56,482 INFO MainThread:14630 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-02 18:06:56,482 INFO MainThread:14630 [wandb_setup.py:_flush():76] Configure stats pid to 14630
+ 2024-08-02 18:06:56,482 INFO MainThread:14630 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240802_180656-l8nnlt0c/logs/debug.log
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240802_180656-l8nnlt0c/logs/debug-internal.log
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:init():566] calling init triggers
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-02-18:06:43', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:init():616] starting backend
+ 2024-08-02 18:06:56,483 INFO MainThread:14630 [wandb_init.py:init():620] setting up manager
+ 2024-08-02 18:06:56,488 INFO MainThread:14630 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-02 18:06:56,489 INFO MainThread:14630 [wandb_init.py:init():628] backend started and connected
+ 2024-08-02 18:06:56,494 INFO MainThread:14630 [wandb_init.py:init():720] updated telemetry
+ 2024-08-02 18:06:56,505 INFO MainThread:14630 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-02 18:06:56,963 INFO MainThread:14630 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-02 18:06:57,043 INFO MainThread:14630 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-02 18:06:57,043 INFO MainThread:14630 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-02 18:06:57,104 INFO MainThread:14630 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-02 18:06:57,105 INFO MainThread:14630 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-02 18:06:57,105 INFO MainThread:14630 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-02 18:06:57,105 INFO MainThread:14630 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-02 18:06:57,106 INFO MainThread:14630 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-02 18:06:58,607 INFO MainThread:14630 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 512, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
+ 2024-08-02 18:06:58,607 INFO MainThread:14630 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-03 00:06:33,941 WARNING MsgRouterThr:14630 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_021608-l90yeme3/files/config.yaml ADDED
@@ -0,0 +1,335 @@
+ wandb_version: 1
+
+ sharding_strategy:
+   desc: null
+   value: FULL_SHARD
+ checkpoint_type:
+   desc: null
+   value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+   desc: null
+   value: true
+ fsdp_cpu_offload:
+   desc: null
+   value: false
+ low_cpu_fsdp:
+   desc: null
+   value: false
+ no_meta_device:
+   desc: null
+   value: false
+ data_path:
+   desc: null
+   value: null
+ split:
+   desc: null
+   value: 969, 30, 1
+ train_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ valid_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ test_data_path:
+   desc: null
+   value:
+   - '4013541'
+   - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ data_cache_path:
+   desc: null
+   value: null
+ vocab_size:
+   desc: null
+   value: null
+ vocab_file:
+   desc: null
+   value: null
+ merge_file:
+   desc: null
+   value: null
+ seq_length:
+   desc: null
+   value: 1024
+ num_workers:
+   desc: null
+   value: 2
+ tokenizer_type:
+   desc: null
+   value: Llama2Tokenizer
+ tokenizer_model:
+   desc: null
+   value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
+ reset_position_ids:
+   desc: null
+   value: false
+ reset_attention_mask:
+   desc: null
+   value: false
+ eod_mask_loss:
+   desc: null
+   value: false
+ retro_return_doc_ids:
+   desc: null
+   value: false
+ short_seq_prob:
+   desc: null
+   value: 0.1
+ vocab_extra_ids:
+   desc: null
+   value: 0
+ seed:
+   desc: null
+   value: 1234
+ use_mpi:
+   desc: null
+   value: false
+ wandb_entity:
+   desc: null
+   value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+   desc: null
+   value: tiny-mistral-sample5_train_2024-08-04-02:15:57
+ wandb_project:
+   desc: null
+   value: llm_tutorial
+ quantization:
+   desc: null
+   value: false
+ use_freeze_layers:
+   desc: null
+   value: false
+ freeze_layers:
+   desc: null
+   value: null
+ bf16:
+   desc: null
+   value: true
+ fp16:
+   desc: null
+   value: false
+ mixed_precision:
+   desc: null
+   value: true
+ param_dtype:
+   desc: null
+   value: null
+ load:
+   desc: null
+   value: /work/llm_recipes/models/tiny-mistral-sample5
+ save:
+   desc: null
+   value: /work/llm_recipes/models/tiny-mistral-sample5
+ base_model:
+   desc: null
+   value: /share/pretrained_lm/custom/tiny-mistral
+ use_better_transformer:
+   desc: null
+   value: false
+ grad_clip_norm:
+   desc: null
+   value: 1.0
+ eval_interval:
+   desc: null
+   value: 200
+ save_interval:
+   desc: null
+   value: 200
+ eval_iters:
+   desc: null
+   value: 10
+ optimizer:
+   desc: null
+   value: adam
+ lr:
+   desc: null
+   value: 2.0e-05
+ lr_decay_style:
+   desc: null
+   value: cosine
+ lr_decay_iters:
+   desc: null
+   value: 20000
+ lr_warmup_iters:
+   desc: null
+   value: 500
+ min_lr:
+   desc: null
+   value: 1.0e-06
+ train_iters:
+   desc: null
+   value: 20000
+ train_samples:
+   desc: null
+   value: null
+ global_batch_size:
+   desc: null
+   value: 320
+ micro_batch_size:
+   desc: null
+   value: 8
+ make_vocab_size_divisible_by:
+   desc: null
+   value: 128
+ sliding_window_size:
+   desc: null
+   value: 4096
+ skip_batch:
+   desc: null
+   value: null
+ no_save_optimizer_state:
+   desc: null
+   value: false
+ continual_pretraining:
+   desc: null
+   value: false
+ instruction_tuning:
+   desc: null
+   value: false
+ direct_preference_optimization:
+   desc: null
+   value: false
+ attention_dropout:
+   desc: null
+   value: 0.1
+ hidden_dropout:
+   desc: null
+   value: 0.1
+ weight_decay:
+   desc: null
+   value: 0.1
+ adam_beta1:
+   desc: null
+   value: 0.9
+ adam_beta2:
+   desc: null
+   value: 0.95
+ adam_eps:
+   desc: null
+   value: 1.0e-06
+ hf_transformer_model_dir:
+   desc: null
+   value: null
+ instruction_train_data_path:
+   desc: null
+   value: null
+ instruction_valid_data_path:
+   desc: null
+   value: null
+ epoch:
+   desc: null
+   value: null
+ instruction_dataset_size:
+   desc: null
+   value: null
+ save_sampler_state:
+   desc: null
+   value: false
+ label_smoothing:
+   desc: null
+   value: 0.0
+ save_n_checkpoints:
+   desc: null
+   value: 10
+ hf_repo_id:
+   desc: null
+   value: koichi12/tiny-mistral-sample5
+ create_public_hf_repo:
+   desc: null
+   value: false
+ upload_all_checkpoints_to_hf:
+   desc: null
+   value: false
+ hf_upload_retry_limit:
+   desc: null
+   value: 2
+ exit_duration_in_mins:
+   desc: null
+   value: null
+ source_key:
+   desc: null
+   value: null
+ target_key:
+   desc: null
+   value: null
+ attn_implementation:
+   desc: null
+   value: flash_attention_2
+ efficient_instruction_tuning:
+   desc: null
+   value: false
+ remove_padding_masking:
+   desc: null
+   value: false
+ save_start_iter:
+   desc: null
+   value: null
+ rank:
+   desc: null
+   value: 0
+ world_size:
+   desc: null
+   value: 1
+ padded_vocab_size:
+   desc: null
+   value: 32768
+ gradient_accumulation_steps:
+   desc: null
+   value: 40
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.10.12
+     cli_version: 0.16.3
+     framework: huggingface
+     huggingface_version: 4.43.3
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1722705368.213775
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       2:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       3:
+       - 13
+       - 16
+       - 23
+       4: 3.10.12
+       5: 0.16.3
+       6: 4.43.3
+       8:
+       - 5
+       13: linux-x86_64
+ activation_function:
+   desc: null
+   value: silu
+ hidden_size:
+   desc: null
+   value: 256
+ model_type:
+   desc: null
+   value: mistral
+ max_position_embeddings:
+   desc: null
+   value: 1024
+ num_attention_heads:
+   desc: null
+   value: 4
+ num_hidden_layers:
+   desc: null
+   value: 4
+ model_architecture:
+   desc: null
+   value: MistralForCausalLM
wandb/run-20240804_021608-l90yeme3/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240804_021608-l90yeme3/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240804_021608-l90yeme3/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+   "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+   "python": "3.10.12",
+   "heartbeatAt": "2024-08-03T17:16:08.874656",
+   "startedAt": "2024-08-03T17:16:08.183460",
+   "docker": null,
+   "cuda": null,
+   "args": [
+     "--seq-length",
+     "1024",
+     "--sliding-window-size",
+     "4096",
+     "--micro-batch-size",
+     "8",
+     "--global-batch-size",
+     "320",
+     "--train-iters",
+     "20000",
+     "--tokenizer-type",
+     "Llama2Tokenizer",
+     "--tokenizer-model",
+     "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
+     "--train-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--valid-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--test-data-path",
+     "4013541",
+     "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+     "--lr",
+     "2e-5",
+     "--min-lr",
+     "1e-6",
+     "--lr-decay-style",
+     "cosine",
+     "--lr-warmup-iters",
+     "500",
+     "--lr-decay-iters",
+     "20000",
+     "--weight-decay",
+     "0.1",
+     "--grad-clip-norm",
+     "1.0",
+     "--optimizer",
+     "adam",
+     "--adam-beta1",
+     "0.9",
+     "--adam-beta2",
+     "0.95",
+     "--adam-eps",
+     "1e-6",
+     "--save-interval",
+     "200",
+     "--eval-interval",
+     "200",
+     "--eval-iters",
+     "10",
+     "--bf16",
+     "--mixed-precision",
+     "--base-model",
+     "/share/pretrained_lm/custom/tiny-mistral",
+     "--save",
+     "/work/llm_recipes/models/tiny-mistral-sample5",
+     "--load",
+     "/work/llm_recipes/models/tiny-mistral-sample5",
+     "--fsdp-activation-checkpointing",
+     "--sharding-strategy",
+     "FULL_SHARD",
+     "--checkpoint-type",
+     "LOCAL_STATE_DICT",
+     "--save-n-checkpoints",
+     "10",
+     "--hf-upload-retry-limit",
+     "2",
+     "--hf-repo-id",
+     "koichi12/tiny-mistral-sample5",
+     "--wandb-entity",
+     "iwakawa-koichi-q5-tohoku-nlp6723",
+     "--wandb-project",
+     "llm_tutorial",
+     "--wandb-name",
+     "tiny-mistral-sample5_train_2024-08-04-02:15:57"
+   ],
+   "state": "running",
+   "program": "/project/examples/finetuning.py",
+   "codePathLocal": "examples/finetuning.py",
+   "codePath": "examples/finetuning.py",
+   "git": {
+     "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+     "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+   },
+   "email": null,
+   "root": "/project",
+   "host": "gpu-koiwa-00",
+   "username": "koiwa",
+   "executable": "/usr/bin/python",
+   "cpu_count": 18,
+   "cpu_count_logical": 18,
+   "cpu_freq": {
+     "current": 2400.034,
+     "min": 0.0,
+     "max": 0.0
+   },
+   "cpu_freq_per_core": [
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2400.034,
+       "min": 0.0,
+       "max": 0.0
+     }
+   ],
+   "disk": {
+     "/": {
+       "total": 0.0625,
+       "used": 1.1444091796875e-05
+     }
+   },
+   "gpu": "NVIDIA A100-SXM4-40GB",
+   "gpu_count": 1,
+   "gpu_devices": [
+     {
+       "name": "NVIDIA A100-SXM4-40GB",
+       "memory_total": 42949672960
+     }
+   ],
+   "memory": {
+     "total": 56.48782730102539
+   }
+ }
wandb/run-20240804_021608-l90yeme3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"training/loss": 8.691668510437012, "training/perplexity": 5953.106781607083, "utils/batch_size": 8, "utils/global_batch_size": 320, "utils/seq_len": 1025, "utils/gradient_accumulation_steps": 40, "utils/iteration": 3169, "optimizer/lr": 1.91351934671402e-05, "optimizer/variance_l2": 0.01361168373992461, "optimizer/variance_sqrt_l2": 0.9996963278311211, "optimizer/momentum_l2": 0.9700213365567774, "optimizer/weight_l2": 101.93656115447489, "optimizer/variance_l1": 0.9979248046875, "optimizer/variance_sqrt_l1": 598.25, "optimizer/momentum_l1": 420.75, "optimizer/weight_l1": 332992.0, "optimizer/variance_abs_max": 0.00130462646484375, "optimizer/variance_sqrt_abs_max": 0.0361328125, "optimizer/momentum_abs_max": 0.03515625, "optimizer/weight_abs_max": 1.0, "stats/1_iteration_time": 1.5153617600008147, "stats/tokens_per_sec": 216449.96505641245, "stats/tokens_per_sec_per_gpu": 216449.96505641245, "stats/tflops": 16.684531271256578, "_timestamp": 1722710244.673399, "_runtime": 4876.459624052048, "_step": 3169, "evaluation/val_loss": 8.686346054077148, "evaluation/val_ppl": 5921.50634765625, "_wandb": {"runtime": 4876}}
wandb/run-20240804_021608-l90yeme3/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240804_021608-l90yeme3/logs/debug.log ADDED
@@ -0,0 +1,29 @@
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Configure stats pid to 11734
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-04 02:16:08,207 INFO MainThread:11734 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021608-l90yeme3/logs/debug.log
+ 2024-08-04 02:16:08,208 INFO MainThread:11734 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021608-l90yeme3/logs/debug-internal.log
+ 2024-08-04 02:16:08,208 INFO MainThread:11734 [wandb_init.py:init():566] calling init triggers
+ 2024-08-04 02:16:08,208 INFO MainThread:11734 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample5_train_2024-08-04-02:15:57', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample5', 'save': '/work/llm_recipes/models/tiny-mistral-sample5', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample5', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
+ 2024-08-04 02:16:08,208 INFO MainThread:11734 [wandb_init.py:init():616] starting backend
+ 2024-08-04 02:16:08,208 INFO MainThread:11734 [wandb_init.py:init():620] setting up manager
+ 2024-08-04 02:16:08,212 INFO MainThread:11734 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-04 02:16:08,213 INFO MainThread:11734 [wandb_init.py:init():628] backend started and connected
+ 2024-08-04 02:16:08,218 INFO MainThread:11734 [wandb_init.py:init():720] updated telemetry
+ 2024-08-04 02:16:08,228 INFO MainThread:11734 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-04 02:16:08,766 INFO MainThread:11734 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-04 02:16:08,847 INFO MainThread:11734 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-04 02:16:08,847 INFO MainThread:11734 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-04 02:16:08,953 INFO MainThread:11734 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-04 02:16:08,953 INFO MainThread:11734 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-04 02:16:08,954 INFO MainThread:11734 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-04 02:16:08,954 INFO MainThread:11734 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-04 02:16:08,954 INFO MainThread:11734 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-04 02:16:09,857 INFO MainThread:11734 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 1024, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
+ 2024-08-04 02:16:09,857 INFO MainThread:11734 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240804_035906-457c7q3q/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ test_data_path:
+ desc: null
+ value:
+ - '4013541'
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ data_cache_path:
+ desc: null
+ value: null
+ vocab_size:
+ desc: null
+ value: null
+ vocab_file:
+ desc: null
+ value: null
+ merge_file:
+ desc: null
+ value: null
+ seq_length:
+ desc: null
+ value: 512
+ num_workers:
+ desc: null
+ value: 2
+ tokenizer_type:
+ desc: null
+ value: Llama2Tokenizer
+ tokenizer_model:
+ desc: null
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
+ reset_position_ids:
+ desc: null
+ value: false
+ reset_attention_mask:
+ desc: null
+ value: false
+ eod_mask_loss:
+ desc: null
+ value: false
+ retro_return_doc_ids:
+ desc: null
+ value: false
+ short_seq_prob:
+ desc: null
+ value: 0.1
+ vocab_extra_ids:
+ desc: null
+ value: 0
+ seed:
+ desc: null
+ value: 1234
+ use_mpi:
+ desc: null
+ value: false
+ wandb_entity:
+ desc: null
+ value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+ desc: null
+ value: tiny-llama-sample_train_2024-08-04-03:58:55
+ wandb_project:
+ desc: null
+ value: llm_tutorial
+ quantization:
+ desc: null
+ value: false
+ use_freeze_layers:
+ desc: null
+ value: false
+ freeze_layers:
+ desc: null
+ value: null
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ mixed_precision:
+ desc: null
+ value: true
+ param_dtype:
+ desc: null
+ value: null
+ load:
+ desc: null
+ value: /work/llm_recipes/models/tiny-llama-sample
+ save:
+ desc: null
+ value: /work/llm_recipes/models/tiny-llama-sample
+ base_model:
+ desc: null
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+ use_better_transformer:
+ desc: null
+ value: false
+ grad_clip_norm:
+ desc: null
+ value: 1.0
+ eval_interval:
+ desc: null
+ value: 200
+ save_interval:
+ desc: null
+ value: 200
+ eval_iters:
+ desc: null
+ value: 10
+ optimizer:
+ desc: null
+ value: adam
+ lr:
+ desc: null
+ value: 2.0e-05
+ lr_decay_style:
+ desc: null
+ value: cosine
+ lr_decay_iters:
+ desc: null
+ value: 2000
+ lr_warmup_iters:
+ desc: null
+ value: 500
+ min_lr:
+ desc: null
+ value: 1.0e-06
+ train_iters:
+ desc: null
+ value: 2000
+ train_samples:
+ desc: null
+ value: null
+ global_batch_size:
+ desc: null
+ value: 320
+ micro_batch_size:
+ desc: null
+ value: 8
+ make_vocab_size_divisible_by:
+ desc: null
+ value: 128
+ sliding_window_size:
+ desc: null
+ value: 4096
+ skip_batch:
+ desc: null
+ value: null
+ no_save_optimizer_state:
+ desc: null
+ value: false
+ continual_pretraining:
+ desc: null
+ value: false
+ instruction_tuning:
+ desc: null
+ value: false
+ direct_preference_optimization:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.1
+ hidden_dropout:
+ desc: null
+ value: 0.1
+ weight_decay:
+ desc: null
+ value: 0.1
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.95
+ adam_eps:
+ desc: null
+ value: 1.0e-06
+ hf_transformer_model_dir:
+ desc: null
+ value: null
+ instruction_train_data_path:
+ desc: null
+ value: null
+ instruction_valid_data_path:
+ desc: null
+ value: null
+ epoch:
+ desc: null
+ value: null
+ instruction_dataset_size:
+ desc: null
+ value: null
+ save_sampler_state:
+ desc: null
+ value: false
+ label_smoothing:
+ desc: null
+ value: 0.0
+ save_n_checkpoints:
+ desc: null
+ value: 10
+ hf_repo_id:
+ desc: null
+ value: koichi12/tiny-llama-sample
+ create_public_hf_repo:
+ desc: null
+ value: false
+ upload_all_checkpoints_to_hf:
+ desc: null
+ value: false
+ hf_upload_retry_limit:
+ desc: null
+ value: 2
+ exit_duration_in_mins:
+ desc: null
+ value: null
+ source_key:
+ desc: null
+ value: null
+ target_key:
+ desc: null
+ value: null
+ attn_implementation:
+ desc: null
+ value: flash_attention_2
+ efficient_instruction_tuning:
+ desc: null
+ value: false
+ remove_padding_masking:
+ desc: null
+ value: false
+ save_start_iter:
+ desc: null
+ value: null
+ rank:
+ desc: null
+ value: 0
+ world_size:
+ desc: null
+ value: 1
+ padded_vocab_size:
+ desc: null
+ value: 32000
+ gradient_accumulation_steps:
+ desc: null
+ value: 40
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.43.3
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1722711546.225609
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 2:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 3:
+ - 13
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 6: 4.43.3
+ 8:
+ - 5
+ 13: linux-x86_64
+ activation_function:
+ desc: null
+ value: silu
+ hidden_size:
+ desc: null
+ value: 2048
+ model_type:
+ desc: null
+ value: llama
+ max_position_embeddings:
+ desc: null
+ value: 2048
+ num_attention_heads:
+ desc: null
+ value: 32
+ num_hidden_layers:
+ desc: null
+ value: 22
+ model_architecture:
+ desc: null
+ value: LlamaForCausalLM
wandb/run-20240804_035906-457c7q3q/files/output.log ADDED
@@ -0,0 +1,130 @@
+ Created Hugging Face repository with ID koichi12/tiny-llama-sample.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping model loading
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+ warnings.warn(
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+ train: 640000
+ validation: 35200
+ test: 3200
+ > building train, validation, and test datasets for GPT ...
+ > finished creating GPT datasets ...
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama-sample, skipping optimizer loading
+ File not found: /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama-sample/latest_iteration.txt
+ model info: FullyShardedDataParallel(
+ (_fsdp_wrapped_module): LlamaForCausalLM(
+ (model): LlamaModel(
+ (embed_tokens): Embedding(32000, 2048)
+ (layers): ModuleList(
+ (0-21): 22 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
+ (self_attn): LlamaFlashAttention2(
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+ (rotary_emb): LlamaRotaryEmbedding()
+ )
+ (mlp): LlamaMLP(
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): LlamaRMSNorm()
+ (post_attention_layernorm): LlamaRMSNorm()
+ )
+ )
+ )
+ )
+ (norm): LlamaRMSNorm()
+ (rotary_emb): LlamaRotaryEmbedding()
+ )
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+ )
+ )
+ model config: LlamaConfig {
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5632,
+ "label_smoothing": 0.0,
+ "max_position_embeddings": 2048,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 22,
+ "num_key_value_heads": 4,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.43.3",
+ "use_cache": false,
+ "vocab_size": 32000
+ }
+ Let split = None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Traceback (most recent call last):
+ File "/project/examples/finetuning.py", line 13, in <module>
+ main()
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
+ train(
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
+ loss: torch.Tensor = model(**batch).loss
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
+ output = self._fsdp_wrapped_module(*args, **kwargs)
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
+ outputs = self.model(
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
+ cache_position = torch.arange(
+ RuntimeError: CUDA error: device-side assert triggered
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb/run-20240804_035906-457c7q3q/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240804_035906-457c7q3q/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-08-03T18:59:06.856800",
+ "startedAt": "2024-08-03T18:59:06.213352",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--seq-length",
+ "512",
+ "--sliding-window-size",
+ "4096",
+ "--micro-batch-size",
+ "8",
+ "--global-batch-size",
+ "320",
+ "--train-iters",
+ "2000",
+ "--tokenizer-type",
+ "Llama2Tokenizer",
+ "--tokenizer-model",
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+ "--train-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--valid-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--test-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--lr",
+ "2e-5",
+ "--min-lr",
+ "1e-6",
+ "--lr-decay-style",
+ "cosine",
+ "--lr-warmup-iters",
+ "500",
+ "--lr-decay-iters",
+ "2000",
+ "--weight-decay",
+ "0.1",
+ "--grad-clip-norm",
+ "1.0",
+ "--optimizer",
+ "adam",
+ "--adam-beta1",
+ "0.9",
+ "--adam-beta2",
+ "0.95",
+ "--adam-eps",
+ "1e-6",
+ "--save-interval",
+ "200",
+ "--eval-interval",
+ "200",
+ "--eval-iters",
+ "10",
+ "--bf16",
+ "--mixed-precision",
+ "--base-model",
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+ "--save",
+ "/work/llm_recipes/models/tiny-llama-sample",
+ "--load",
+ "/work/llm_recipes/models/tiny-llama-sample",
+ "--fsdp-activation-checkpointing",
+ "--sharding-strategy",
+ "FULL_SHARD",
+ "--checkpoint-type",
+ "LOCAL_STATE_DICT",
+ "--save-n-checkpoints",
+ "10",
+ "--hf-upload-retry-limit",
+ "2",
+ "--hf-repo-id",
+ "koichi12/tiny-llama-sample",
+ "--wandb-entity",
+ "iwakawa-koichi-q5-tohoku-nlp6723",
+ "--wandb-project",
+ "llm_tutorial",
+ "--wandb-name",
+ "tiny-llama-sample_train_2024-08-04-03:58:55"
+ ],
+ "state": "running",
+ "program": "/project/examples/finetuning.py",
+ "codePathLocal": "examples/finetuning.py",
+ "codePath": "examples/finetuning.py",
+ "git": {
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+ },
+ "email": null,
+ "root": "/project",
+ "host": "gpu-koiwa-00",
+ "username": "koiwa",
+ "executable": "/usr/bin/python",
+ "cpu_count": 18,
+ "cpu_count_logical": 18,
+ "cpu_freq": {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 0.0625,
+ "used": 1.1444091796875e-05
+ }
+ },
+ "gpu": "NVIDIA A100-SXM4-40GB",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100-SXM4-40GB",
+ "memory_total": 42949672960
+ }
+ ],
+ "memory": {
+ "total": 56.48782730102539
+ }
+ }
wandb/run-20240804_035906-457c7q3q/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 3}}
wandb/run-20240804_035906-457c7q3q/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
+ 2024-08-04 03:59:06,227 INFO StreamThr :13051 [internal.py:wandb_internal():86] W&B internal server running at pid: 13051, started at: 2024-08-04 03:59:06.226186
+ 2024-08-04 03:59:06,228 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: status
+ 2024-08-04 03:59:06,230 INFO WriterThread:13051 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_035906-457c7q3q/run-457c7q3q.wandb
+ 2024-08-04 03:59:06,231 DEBUG SenderThread:13051 [sender.py:send():382] send: header
+ 2024-08-04 03:59:06,244 DEBUG SenderThread:13051 [sender.py:send():382] send: run
+ 2024-08-04 03:59:06,745 INFO SenderThread:13051 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_035906-457c7q3q/files
+ 2024-08-04 03:59:06,745 INFO SenderThread:13051 [sender.py:_start_run_threads():1136] run started: 457c7q3q with start time 1722711546.225609
+ 2024-08-04 03:59:06,750 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: check_version
+ 2024-08-04 03:59:06,751 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: check_version
+ 2024-08-04 03:59:06,837 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: run_start
+ 2024-08-04 03:59:06,843 DEBUG HandlerThread:13051 [system_info.py:__init__():27] System info init
+ 2024-08-04 03:59:06,843 DEBUG HandlerThread:13051 [system_info.py:__init__():42] System info init done
+ 2024-08-04 03:59:06,843 INFO HandlerThread:13051 [system_monitor.py:start():194] Starting system monitor
+ 2024-08-04 03:59:06,843 INFO SystemMonitor:13051 [system_monitor.py:_start():158] Starting system asset monitoring threads
+ 2024-08-04 03:59:06,843 INFO HandlerThread:13051 [system_monitor.py:probe():214] Collecting system info
+ 2024-08-04 03:59:06,844 INFO SystemMonitor:13051 [interfaces.py:start():190] Started cpu monitoring
+ 2024-08-04 03:59:06,844 INFO SystemMonitor:13051 [interfaces.py:start():190] Started disk monitoring
+ 2024-08-04 03:59:06,845 INFO SystemMonitor:13051 [interfaces.py:start():190] Started gpu monitoring
+ 2024-08-04 03:59:06,846 INFO SystemMonitor:13051 [interfaces.py:start():190] Started memory monitoring
+ 2024-08-04 03:59:06,847 INFO SystemMonitor:13051 [interfaces.py:start():190] Started network monitoring
+ 2024-08-04 03:59:06,856 DEBUG HandlerThread:13051 [system_info.py:probe():151] Probing system
+ 2024-08-04 03:59:06,858 DEBUG HandlerThread:13051 [system_info.py:_probe_git():136] Probing git
+ 2024-08-04 03:59:06,869 DEBUG HandlerThread:13051 [system_info.py:_probe_git():144] Probing git done
+ 2024-08-04 03:59:06,869 DEBUG HandlerThread:13051 [system_info.py:probe():199] Probing system done
+ 2024-08-04 03:59:06,869 DEBUG HandlerThread:13051 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T18:59:06.856800', 'startedAt': '2024-08-03T18:59:06.213352', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama-sample', '--load', '/work/llm_recipes/models/tiny-llama-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama-sample_train_2024-08-04-03:58:55'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
+ 2024-08-04 03:59:06,870 INFO HandlerThread:13051 [system_monitor.py:probe():224] Finished collecting system info
+ 2024-08-04 03:59:06,870 INFO HandlerThread:13051 [system_monitor.py:probe():227] Publishing system info
+ 2024-08-04 03:59:06,871 INFO HandlerThread:13051 [system_monitor.py:probe():229] Finished publishing system info
+ 2024-08-04 03:59:06,876 DEBUG SenderThread:13051 [sender.py:send():382] send: files
+ 2024-08-04 03:59:06,877 INFO SenderThread:13051 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+ 2024-08-04 03:59:06,886 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: python_packages
+ 2024-08-04 03:59:06,886 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-04 03:59:06,886 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: python_packages
+ 2024-08-04 03:59:06,887 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-04 03:59:06,888 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-04 03:59:07,128 DEBUG SenderThread:13051 [sender.py:send():382] send: telemetry
+ 2024-08-04 03:59:07,553 INFO wandb-upload_0:13051 [upload_job.py:push():131] Uploaded file /tmp/tmpuq1rfkhgwandb/205blebe-wandb-metadata.json
+ 2024-08-04 03:59:07,747 INFO Thread-12 :13051 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035906-457c7q3q/files/output.log
+ 2024-08-04 03:59:07,747 INFO Thread-12 :13051 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035906-457c7q3q/files/wandb-metadata.json
+ 2024-08-04 03:59:07,747 INFO Thread-12 :13051 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035906-457c7q3q/files/requirements.txt
+ 2024-08-04 03:59:09,511 DEBUG SenderThread:13051 [sender.py:send():382] send: config
+ 2024-08-04 03:59:09,512 DEBUG SenderThread:13051 [sender.py:send():382] send: config
+ 2024-08-04 03:59:09,747 INFO Thread-12 :13051 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035906-457c7q3q/files/output.log
+ 2024-08-04 03:59:10,109 DEBUG SenderThread:13051 [sender.py:send():382] send: exit
+ 2024-08-04 03:59:10,109 INFO SenderThread:13051 [sender.py:send_exit():589] handling exit code: 1
+ 2024-08-04 03:59:10,109 INFO SenderThread:13051 [sender.py:send_exit():591] handling runtime: 3
+ 2024-08-04 03:59:10,123 INFO SenderThread:13051 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
48
+ 2024-08-04 03:59:10,123 INFO SenderThread:13051 [sender.py:send_exit():597] send defer
49
+ 2024-08-04 03:59:10,123 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
50
+ 2024-08-04 03:59:10,123 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 0
51
+ 2024-08-04 03:59:10,124 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
52
+ 2024-08-04 03:59:10,124 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 0
53
+ 2024-08-04 03:59:10,124 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 1
54
+ 2024-08-04 03:59:10,124 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
55
+ 2024-08-04 03:59:10,124 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 1
56
+ 2024-08-04 03:59:10,124 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
57
+ 2024-08-04 03:59:10,124 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 1
58
+ 2024-08-04 03:59:10,124 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 2
59
+ 2024-08-04 03:59:10,124 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
60
+ 2024-08-04 03:59:10,124 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 2
61
+ 2024-08-04 03:59:10,124 INFO HandlerThread:13051 [system_monitor.py:finish():203] Stopping system monitor
62
+ 2024-08-04 03:59:10,124 DEBUG SystemMonitor:13051 [system_monitor.py:_start():172] Starting system metrics aggregation loop
63
+ 2024-08-04 03:59:10,125 INFO HandlerThread:13051 [interfaces.py:finish():202] Joined cpu monitor
64
+ 2024-08-04 03:59:10,125 DEBUG SystemMonitor:13051 [system_monitor.py:_start():179] Finished system metrics aggregation loop
65
+ 2024-08-04 03:59:10,125 INFO HandlerThread:13051 [interfaces.py:finish():202] Joined disk monitor
66
+ 2024-08-04 03:59:10,125 DEBUG SystemMonitor:13051 [system_monitor.py:_start():183] Publishing last batch of metrics
67
+ 2024-08-04 03:59:10,159 INFO HandlerThread:13051 [interfaces.py:finish():202] Joined gpu monitor
68
+ 2024-08-04 03:59:10,159 INFO HandlerThread:13051 [interfaces.py:finish():202] Joined memory monitor
69
+ 2024-08-04 03:59:10,159 INFO HandlerThread:13051 [interfaces.py:finish():202] Joined network monitor
70
+ 2024-08-04 03:59:10,160 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
71
+ 2024-08-04 03:59:10,160 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 2
72
+ 2024-08-04 03:59:10,160 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 3
73
+ 2024-08-04 03:59:10,160 DEBUG SenderThread:13051 [sender.py:send():382] send: stats
74
+ 2024-08-04 03:59:10,160 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
75
+ 2024-08-04 03:59:10,160 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 3
76
+ 2024-08-04 03:59:10,161 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
77
+ 2024-08-04 03:59:10,161 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 3
78
+ 2024-08-04 03:59:10,161 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 4
79
+ 2024-08-04 03:59:10,161 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
80
+ 2024-08-04 03:59:10,161 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 4
81
+ 2024-08-04 03:59:10,161 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
82
+ 2024-08-04 03:59:10,161 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 4
83
+ 2024-08-04 03:59:10,161 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 5
84
+ 2024-08-04 03:59:10,161 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
85
+ 2024-08-04 03:59:10,161 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 5
86
+ 2024-08-04 03:59:10,161 DEBUG SenderThread:13051 [sender.py:send():382] send: summary
87
+ 2024-08-04 03:59:10,165 INFO SenderThread:13051 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
88
+ 2024-08-04 03:59:10,165 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
89
+ 2024-08-04 03:59:10,165 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 5
90
+ 2024-08-04 03:59:10,165 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 6
91
+ 2024-08-04 03:59:10,165 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
92
+ 2024-08-04 03:59:10,165 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 6
93
+ 2024-08-04 03:59:10,165 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
94
+ 2024-08-04 03:59:10,165 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 6
95
+ 2024-08-04 03:59:10,168 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-04 03:59:10,367 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 7
97
+ 2024-08-04 03:59:10,367 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
98
+ 2024-08-04 03:59:10,367 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 7
99
+ 2024-08-04 03:59:10,367 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
100
+ 2024-08-04 03:59:10,368 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 7
101
+ 2024-08-04 03:59:10,749 INFO Thread-12 :13051 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035906-457c7q3q/files/config.yaml
102
+ 2024-08-04 03:59:10,749 INFO Thread-12 :13051 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_035906-457c7q3q/files/wandb-summary.json
103
+ 2024-08-04 03:59:11,109 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 03:59:11,580 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 03:59:11,580 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 03:59:11,580 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 03:59:11,581 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 03:59:11,581 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 03:59:11,581 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 03:59:11,581 INFO SenderThread:13051 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 03:59:11,582 INFO SenderThread:13051 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 03:59:11,595 INFO SenderThread:13051 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 03:59:11,631 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 03:59:11,631 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-04 03:59:11,631 DEBUG SenderThread:13051 [sender.py:send():382] send: artifact
116
+ 2024-08-04 03:59:11,631 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 03:59:11,749 INFO Thread-12 :13051 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_035906-457c7q3q/files/output.log
118
+ 2024-08-04 03:59:12,109 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 03:59:12,520 INFO SenderThread:13051 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 03:59:12,520 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 03:59:12,520 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 03:59:12,520 INFO SenderThread:13051 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 03:59:12,750 INFO SenderThread:13051 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_035906-457c7q3q/files
124
+ 2024-08-04 03:59:12,750 INFO SenderThread:13051 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035906-457c7q3q/files/requirements.txt requirements.txt
125
+ 2024-08-04 03:59:12,751 INFO SenderThread:13051 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035906-457c7q3q/files/config.yaml config.yaml
126
+ 2024-08-04 03:59:12,752 INFO SenderThread:13051 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035906-457c7q3q/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 03:59:12,752 INFO SenderThread:13051 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035906-457c7q3q/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 03:59:12,754 INFO SenderThread:13051 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_035906-457c7q3q/files/output.log output.log
129
+ 2024-08-04 03:59:12,755 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 03:59:12,755 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 03:59:12,755 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 03:59:12,757 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 03:59:12,757 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 03:59:12,757 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 03:59:12,757 INFO SenderThread:13051 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 03:59:13,109 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: poll_exit
137
+ 2024-08-04 03:59:13,110 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: poll_exit
138
+ 2024-08-04 03:59:13,154 INFO wandb-upload_0:13051 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035906-457c7q3q/files/requirements.txt
139
+ 2024-08-04 03:59:13,257 INFO wandb-upload_1:13051 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035906-457c7q3q/files/config.yaml
140
+ 2024-08-04 03:59:13,334 INFO wandb-upload_2:13051 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035906-457c7q3q/files/wandb-summary.json
141
+ 2024-08-04 03:59:13,368 INFO wandb-upload_3:13051 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_035906-457c7q3q/files/output.log
142
+ 2024-08-04 03:59:13,568 INFO Thread-11 (_thread_body):13051 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 03:59:13,569 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 03:59:13,569 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 03:59:13,569 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 03:59:13,569 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 03:59:13,569 INFO SenderThread:13051 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 03:59:13,569 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 03:59:13,570 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 03:59:13,570 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 03:59:13,570 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 03:59:13,570 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 03:59:13,570 INFO SenderThread:13051 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 03:59:13,759 INFO SenderThread:13051 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 03:59:13,759 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 03:59:13,759 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 03:59:13,759 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 03:59:13,759 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 03:59:13,759 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 03:59:13,759 INFO SenderThread:13051 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 03:59:13,759 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 03:59:13,759 DEBUG SenderThread:13051 [sender.py:send():382] send: final
163
+ 2024-08-04 03:59:13,759 INFO HandlerThread:13051 [handler.py:handle_request_defer():172] handle defer: 14
164
+ 2024-08-04 03:59:13,760 DEBUG SenderThread:13051 [sender.py:send():382] send: footer
165
+ 2024-08-04 03:59:13,760 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 03:59:13,760 INFO SenderThread:13051 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 03:59:13,760 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 03:59:13,760 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: poll_exit
169
+ 2024-08-04 03:59:13,761 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: server_info
170
+ 2024-08-04 03:59:13,761 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: get_summary
171
+ 2024-08-04 03:59:13,761 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: sampled_history
172
+ 2024-08-04 03:59:13,761 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: poll_exit
173
+ 2024-08-04 03:59:13,761 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: internal_messages
174
+ 2024-08-04 03:59:13,761 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: poll_exit
175
+ 2024-08-04 03:59:13,762 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: server_info
176
+ 2024-08-04 03:59:13,763 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 03:59:13,927 DEBUG SenderThread:13051 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 03:59:13,927 INFO MainThread:13051 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 03:59:13,927 INFO MainThread:13051 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 03:59:13,928 INFO MainThread:13051 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 03:59:13,928 DEBUG HandlerThread:13051 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 03:59:13,928 INFO HandlerThread:13051 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 03:59:14,763 INFO WriterThread:13051 [datastore.py:close():296] close: /project/wandb/run-20240804_035906-457c7q3q/run-457c7q3q.wandb
184
+ 2024-08-04 03:59:14,927 INFO SenderThread:13051 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 03:59:14,928 INFO SenderThread:13051 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 03:59:14,928 INFO SenderThread:13051 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_035906-457c7q3q/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Configure stats pid to 12980
3
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_035906-457c7q3q/logs/debug.log
9
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_035906-457c7q3q/logs/debug-internal.log
10
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama-sample_train_2024-08-04-03:58:55', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama-sample', 'save': '/work/llm_recipes/models/tiny-llama-sample', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 
'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 03:59:06,219 INFO MainThread:12980 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 03:59:06,220 INFO MainThread:12980 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 03:59:06,224 INFO MainThread:12980 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 03:59:06,225 INFO MainThread:12980 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 03:59:06,230 INFO MainThread:12980 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 03:59:06,240 INFO MainThread:12980 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 03:59:06,750 INFO MainThread:12980 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 03:59:06,830 INFO MainThread:12980 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 03:59:06,830 INFO MainThread:12980 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 03:59:06,885 INFO MainThread:12980 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 03:59:06,885 INFO MainThread:12980 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 03:59:06,885 INFO MainThread:12980 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 03:59:06,886 INFO MainThread:12980 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 03:59:06,887 INFO MainThread:12980 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 03:59:09,511 INFO MainThread:12980 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 03:59:09,511 INFO MainThread:12980 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240804_035906-457c7q3q/run-457c7q3q.wandb ADDED
Binary file (20.8 kB). View file
 
wandb/run-20240804_143449-7tyiihss/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:34:38
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+   desc: null
+   value: 128
+ sliding_window_size:
+   desc: null
+   value: 4096
+ skip_batch:
+   desc: null
+   value: null
+ no_save_optimizer_state:
+   desc: null
+   value: false
+ continual_pretraining:
+   desc: null
+   value: false
+ instruction_tuning:
+   desc: null
+   value: false
+ direct_preference_optimization:
+   desc: null
+   value: false
+ attention_dropout:
+   desc: null
+   value: 0.1
+ hidden_dropout:
+   desc: null
+   value: 0.1
+ weight_decay:
+   desc: null
+   value: 0.1
+ adam_beta1:
+   desc: null
+   value: 0.9
+ adam_beta2:
+   desc: null
+   value: 0.95
+ adam_eps:
+   desc: null
+   value: 1.0e-06
+ hf_transformer_model_dir:
+   desc: null
+   value: null
+ instruction_train_data_path:
+   desc: null
+   value: null
+ instruction_valid_data_path:
+   desc: null
+   value: null
+ epoch:
+   desc: null
+   value: null
+ instruction_dataset_size:
+   desc: null
+   value: null
+ save_sampler_state:
+   desc: null
+   value: false
+ label_smoothing:
+   desc: null
+   value: 0.0
+ save_n_checkpoints:
+   desc: null
+   value: 10
+ hf_repo_id:
+   desc: null
+   value: koichi12/tiny-llama
+ create_public_hf_repo:
+   desc: null
+   value: false
+ upload_all_checkpoints_to_hf:
+   desc: null
+   value: false
+ hf_upload_retry_limit:
+   desc: null
+   value: 2
+ exit_duration_in_mins:
+   desc: null
+   value: null
+ source_key:
+   desc: null
+   value: null
+ target_key:
+   desc: null
+   value: null
+ attn_implementation:
+   desc: null
+   value: flash_attention_2
+ efficient_instruction_tuning:
+   desc: null
+   value: false
+ remove_padding_masking:
+   desc: null
+   value: false
+ save_start_iter:
+   desc: null
+   value: null
+ rank:
+   desc: null
+   value: 0
+ world_size:
+   desc: null
+   value: 1
+ padded_vocab_size:
+   desc: null
+   value: 32000
+ gradient_accumulation_steps:
+   desc: null
+   value: 40
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.10.12
+     cli_version: 0.16.3
+     framework: huggingface
+     huggingface_version: 4.43.3
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1722749689.905326
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       2:
+       - 1
+       - 11
+       - 49
+       - 55
+       - 71
+       3:
+       - 13
+       - 16
+       - 23
+       4: 3.10.12
+       5: 0.16.3
+       6: 4.43.3
+       8:
+       - 5
+       13: linux-x86_64
+ activation_function:
+   desc: null
+   value: silu
+ hidden_size:
+   desc: null
+   value: 2048
+ model_type:
+   desc: null
+   value: llama
+ max_position_embeddings:
+   desc: null
+   value: 2048
+ num_attention_heads:
+   desc: null
+   value: 32
+ num_hidden_layers:
+   desc: null
+   value: 22
+ model_architecture:
+   desc: null
+   value: LlamaForCausalLM
wandb/run-20240804_143449-7tyiihss/files/output.log ADDED
@@ -0,0 +1,135 @@
+ Created Hugging Face repository with ID koichi12/tiny-llama.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+   warnings.warn(
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+     train: 640000
+     validation: 35200
+     test: 3200
+ > building train, validation, and test datasets for GPT ...
+ > finished creating GPT datasets ...
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ model info: FullyShardedDataParallel(
+   (_fsdp_wrapped_module): LlamaForCausalLM(
+     (model): LlamaModel(
+       (embed_tokens): Embedding(32000, 2048)
+       (layers): ModuleList(
+         (0-21): 22 x FullyShardedDataParallel(
+           (_fsdp_wrapped_module): CheckpointWrapper(
+             (_checkpoint_wrapped_module): LlamaDecoderLayer(
+               (self_attn): LlamaFlashAttention2(
+                 (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+                 (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+                 (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (rotary_emb): LlamaRotaryEmbedding()
+               )
+               (mlp): LlamaMLP(
+                 (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                 (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                 (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): LlamaRMSNorm()
+               (post_attention_layernorm): LlamaRMSNorm()
+             )
+           )
+         )
+       )
+       (norm): LlamaRMSNorm()
+       (rotary_emb): LlamaRotaryEmbedding()
+     )
+     (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+   )
+ )
+ model config: LlamaConfig {
+   "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5632,
+   "label_smoothing": 0.0,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 22,
+   "num_key_value_heads": 4,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.43.3",
+   "use_cache": false,
+   "vocab_size": 32000
+ }
+ Let split = None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Traceback (most recent call last):
+   File "/project/examples/finetuning.py", line 13, in <module>
+     main()
+   File "/project/src/llama_recipes/finetuning.py", line 281, in main
+     train(
+   File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
+     batch = next(train_dataloader)
+   File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
+     for x in iter:
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
+     data = self._next_data()
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
+     return self._process_data(data)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
+     data.reraise()
+   File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
+     raise exception
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
+ Original Traceback (most recent call last):
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
+     data = fetcher.fetch(index)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+     return self.collate_fn(data)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
+     return collate(batch, collate_fn_map=default_collate_fn_map)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
+     return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
+     return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
+     return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
+     return torch.stack(batch, 0, out=out)
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
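Editor's note on the traceback above: the run dies because torch's `default_collate` tries to `torch.stack()` per-sample `input_ids` tensors of different lengths (513 vs 543). The repo's actual fix is not shown in this diff; below is a minimal sketch (the `pad_collate` name and the dict-of-1-D-LongTensors sample layout are assumptions) of a `collate_fn` that pads every field to the longest sequence in the batch before stacking.

```python
import torch


def pad_collate(batch, pad_token_id=0):
    """Pad each tensor field in a batch of dict samples to the batch max
    length, then stack into a single (batch, max_len) tensor per field."""
    max_len = max(sample["input_ids"].size(0) for sample in batch)
    out = {}
    for key in batch[0]:
        padded = []
        for sample in batch:
            t = sample[key]
            # Labels are conventionally padded with -100 so the cross-entropy
            # loss ignores the padding positions; other fields get the pad id.
            fill = -100 if key == "labels" else pad_token_id
            padded.append(
                torch.nn.functional.pad(t, (0, max_len - t.size(0)), value=fill)
            )
        out[key] = torch.stack(padded, dim=0)
    return out
```

Passing `collate_fn=pad_collate` to the `DataLoader` would make the batch above stack to shape `(batch, 543)`; the other common remedy is to have the dataset emit fixed-length chunks matching `--seq-length` so `default_collate` works unchanged.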
wandb/run-20240804_143449-7tyiihss/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240804_143449-7tyiihss/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+     "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+     "python": "3.10.12",
+     "heartbeatAt": "2024-08-04T05:34:50.487822",
+     "startedAt": "2024-08-04T05:34:49.889154",
+     "docker": null,
+     "cuda": null,
+     "args": [
+         "--seq-length",
+         "512",
+         "--sliding-window-size",
+         "4096",
+         "--micro-batch-size",
+         "8",
+         "--global-batch-size",
+         "320",
+         "--train-iters",
+         "2000",
+         "--tokenizer-type",
+         "Llama2Tokenizer",
+         "--tokenizer-model",
+         "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+         "--train-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--valid-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--test-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--lr",
+         "2e-5",
+         "--min-lr",
+         "1e-6",
+         "--lr-decay-style",
+         "cosine",
+         "--lr-warmup-iters",
+         "500",
+         "--lr-decay-iters",
+         "2000",
+         "--weight-decay",
+         "0.1",
+         "--grad-clip-norm",
+         "1.0",
+         "--optimizer",
+         "adam",
+         "--adam-beta1",
+         "0.9",
+         "--adam-beta2",
+         "0.95",
+         "--adam-eps",
+         "1e-6",
+         "--save-interval",
+         "200",
+         "--eval-interval",
+         "200",
+         "--eval-iters",
+         "10",
+         "--bf16",
+         "--mixed-precision",
+         "--base-model",
+         "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+         "--save",
+         "/work/llm_recipes/models/tiny-llama",
+         "--load",
+         "/work/llm_recipes/models/tiny-llama",
+         "--fsdp-activation-checkpointing",
+         "--sharding-strategy",
+         "FULL_SHARD",
+         "--checkpoint-type",
+         "LOCAL_STATE_DICT",
+         "--save-n-checkpoints",
+         "10",
+         "--hf-upload-retry-limit",
+         "2",
+         "--hf-repo-id",
+         "koichi12/tiny-llama",
+         "--wandb-entity",
+         "iwakawa-koichi-q5-tohoku-nlp6723",
+         "--wandb-project",
+         "llm_tutorial",
+         "--wandb-name",
+         "tiny-llama_train_2024-08-04-14:34:38"
+     ],
+     "state": "running",
+     "program": "/project/examples/finetuning.py",
+     "codePathLocal": "examples/finetuning.py",
+     "codePath": "examples/finetuning.py",
+     "git": {
+         "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+         "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+     },
+     "email": null,
+     "root": "/project",
+     "host": "gpu-koiwa-00",
+     "username": "koiwa",
+     "executable": "/usr/bin/python",
+     "cpu_count": 18,
+     "cpu_count_logical": 18,
+     "cpu_freq": {
+         "current": 2400.0389999999993,
+         "min": 0.0,
+         "max": 0.0
+     },
+     "cpu_freq_per_core": [
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.039,
+             "min": 0.0,
+             "max": 0.0
+         }
+     ],
+     "disk": {
+         "/": {
+             "total": 0.0625,
+             "used": 1.1444091796875e-05
+         }
+     },
+     "gpu": "NVIDIA A100-SXM4-40GB",
+     "gpu_count": 1,
+     "gpu_devices": [
+         {
+             "name": "NVIDIA A100-SXM4-40GB",
+             "memory_total": 42949672960
+         }
+     ],
+     "memory": {
+         "total": 56.48781967163086
+     }
+ }
wandb/run-20240804_143449-7tyiihss/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 3}}
wandb/run-20240804_143449-7tyiihss/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
+ 2024-08-04 14:34:49,906 INFO StreamThr :11193 [internal.py:wandb_internal():86] W&B internal server running at pid: 11193, started at: 2024-08-04 14:34:49.905947
+ 2024-08-04 14:34:49,908 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: status
+ 2024-08-04 14:34:49,910 INFO WriterThread:11193 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_143449-7tyiihss/run-7tyiihss.wandb
+ 2024-08-04 14:34:49,911 DEBUG SenderThread:11193 [sender.py:send():382] send: header
+ 2024-08-04 14:34:49,924 DEBUG SenderThread:11193 [sender.py:send():382] send: run
+ 2024-08-04 14:34:50,371 INFO SenderThread:11193 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_143449-7tyiihss/files
+ 2024-08-04 14:34:50,372 INFO SenderThread:11193 [sender.py:_start_run_threads():1136] run started: 7tyiihss with start time 1722749689.905326
+ 2024-08-04 14:34:50,377 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: check_version
+ 2024-08-04 14:34:50,377 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: check_version
+ 2024-08-04 14:34:50,468 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: run_start
+ 2024-08-04 14:34:50,474 DEBUG HandlerThread:11193 [system_info.py:__init__():27] System info init
+ 2024-08-04 14:34:50,474 DEBUG HandlerThread:11193 [system_info.py:__init__():42] System info init done
+ 2024-08-04 14:34:50,474 INFO HandlerThread:11193 [system_monitor.py:start():194] Starting system monitor
+ 2024-08-04 14:34:50,474 INFO SystemMonitor:11193 [system_monitor.py:_start():158] Starting system asset monitoring threads
+ 2024-08-04 14:34:50,475 INFO HandlerThread:11193 [system_monitor.py:probe():214] Collecting system info
+ 2024-08-04 14:34:50,475 INFO SystemMonitor:11193 [interfaces.py:start():190] Started cpu monitoring
+ 2024-08-04 14:34:50,475 INFO SystemMonitor:11193 [interfaces.py:start():190] Started disk monitoring
+ 2024-08-04 14:34:50,477 INFO SystemMonitor:11193 [interfaces.py:start():190] Started gpu monitoring
+ 2024-08-04 14:34:50,477 INFO SystemMonitor:11193 [interfaces.py:start():190] Started memory monitoring
+ 2024-08-04 14:34:50,478 INFO SystemMonitor:11193 [interfaces.py:start():190] Started network monitoring
+ 2024-08-04 14:34:50,487 DEBUG HandlerThread:11193 [system_info.py:probe():151] Probing system
+ 2024-08-04 14:34:50,490 DEBUG HandlerThread:11193 [system_info.py:_probe_git():136] Probing git
+ 2024-08-04 14:34:50,504 DEBUG HandlerThread:11193 [system_info.py:_probe_git():144] Probing git done
+ 2024-08-04 14:34:50,504 DEBUG HandlerThread:11193 [system_info.py:probe():199] Probing system done
+ 2024-08-04 14:34:50,504 DEBUG HandlerThread:11193 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:34:50.487822', 'startedAt': '2024-08-04T05:34:49.889154', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:34:38'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
+ 2024-08-04 14:34:50,505 INFO HandlerThread:11193 [system_monitor.py:probe():224] Finished collecting system info
+ 2024-08-04 14:34:50,505 INFO HandlerThread:11193 [system_monitor.py:probe():227] Publishing system info
+ 2024-08-04 14:34:50,506 INFO HandlerThread:11193 [system_monitor.py:probe():229] Finished publishing system info
+ 2024-08-04 14:34:50,512 DEBUG SenderThread:11193 [sender.py:send():382] send: files
+ 2024-08-04 14:34:50,512 INFO SenderThread:11193 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+ 2024-08-04 14:34:50,521 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: python_packages
+ 2024-08-04 14:34:50,521 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-04 14:34:50,521 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-04 14:34:50,521 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: python_packages
+ 2024-08-04 14:34:50,523 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-04 14:34:50,781 DEBUG SenderThread:11193 [sender.py:send():382] send: telemetry
+ 2024-08-04 14:34:51,211 INFO wandb-upload_0:11193 [upload_job.py:push():131] Uploaded file /tmp/tmp2tpc65lqwandb/b71f3euv-wandb-metadata.json
+ 2024-08-04 14:34:51,373 INFO Thread-12 :11193 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143449-7tyiihss/files/wandb-metadata.json
+ 2024-08-04 14:34:51,374 INFO Thread-12 :11193 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143449-7tyiihss/files/requirements.txt
+ 2024-08-04 14:34:52,374 INFO Thread-12 :11193 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143449-7tyiihss/files/output.log
+ 2024-08-04 14:34:53,774 DEBUG SenderThread:11193 [sender.py:send():382] send: config
+ 2024-08-04 14:34:53,774 DEBUG SenderThread:11193 [sender.py:send():382] send: config
+ 2024-08-04 14:34:53,858 DEBUG SenderThread:11193 [sender.py:send():382] send: exit
+ 2024-08-04 14:34:53,858 INFO SenderThread:11193 [sender.py:send_exit():589] handling exit code: 1
+ 2024-08-04 14:34:53,858 INFO SenderThread:11193 [sender.py:send_exit():591] handling runtime: 3
+ 2024-08-04 14:34:53,859 INFO SenderThread:11193 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-04 14:34:53,860 INFO SenderThread:11193 [sender.py:send_exit():597] send defer
+ 2024-08-04 14:34:53,860 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 14:34:53,860 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 0
+ 2024-08-04 14:34:53,860 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 14:34:53,860 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 0
+ 2024-08-04 14:34:53,860 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 1
+ 2024-08-04 14:34:53,860 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 14:34:53,860 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 1
+ 2024-08-04 14:34:53,861 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 14:34:53,861 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 1
+ 2024-08-04 14:34:53,861 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 2
+ 2024-08-04 14:34:53,861 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 14:34:53,861 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 2
+ 2024-08-04 14:34:53,861 INFO HandlerThread:11193 [system_monitor.py:finish():203] Stopping system monitor
+ 2024-08-04 14:34:53,861 DEBUG SystemMonitor:11193 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+ 2024-08-04 14:34:53,861 INFO HandlerThread:11193 [interfaces.py:finish():202] Joined cpu monitor
+ 2024-08-04 14:34:53,861 DEBUG SystemMonitor:11193 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+ 2024-08-04 14:34:53,861 INFO HandlerThread:11193 [interfaces.py:finish():202] Joined disk monitor
+ 2024-08-04 14:34:53,862 DEBUG SystemMonitor:11193 [system_monitor.py:_start():183] Publishing last batch of metrics
+ 2024-08-04 14:34:53,894 INFO HandlerThread:11193 [interfaces.py:finish():202] Joined gpu monitor
+ 2024-08-04 14:34:53,894 INFO HandlerThread:11193 [interfaces.py:finish():202] Joined memory monitor
+ 2024-08-04 14:34:53,894 INFO HandlerThread:11193 [interfaces.py:finish():202] Joined network monitor
+ 2024-08-04 14:34:53,894 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 14:34:53,895 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 2
+ 2024-08-04 14:34:53,895 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 3
+ 2024-08-04 14:34:53,895 DEBUG SenderThread:11193 [sender.py:send():382] send: stats
73
+ 2024-08-04 14:34:53,895 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
74
+ 2024-08-04 14:34:53,895 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 3
75
+ 2024-08-04 14:34:53,895 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
76
+ 2024-08-04 14:34:53,895 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 3
77
+ 2024-08-04 14:34:53,895 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 4
78
+ 2024-08-04 14:34:53,895 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 14:34:53,895 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 4
80
+ 2024-08-04 14:34:53,895 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 14:34:53,896 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 4
82
+ 2024-08-04 14:34:53,896 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 5
83
+ 2024-08-04 14:34:53,896 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 14:34:53,896 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 5
85
+ 2024-08-04 14:34:53,896 DEBUG SenderThread:11193 [sender.py:send():382] send: summary
86
+ 2024-08-04 14:34:53,897 INFO SenderThread:11193 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-04 14:34:53,897 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-04 14:34:53,897 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 5
89
+ 2024-08-04 14:34:53,897 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 6
90
+ 2024-08-04 14:34:53,897 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-04 14:34:53,897 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 6
92
+ 2024-08-04 14:34:53,897 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-04 14:34:53,897 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 6
94
+ 2024-08-04 14:34:53,900 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 14:34:54,104 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 7
96
+ 2024-08-04 14:34:54,104 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
97
+ 2024-08-04 14:34:54,104 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 7
98
+ 2024-08-04 14:34:54,104 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 14:34:54,104 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 7
100
+ 2024-08-04 14:34:54,376 INFO Thread-12 :11193 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143449-7tyiihss/files/output.log
101
+ 2024-08-04 14:34:54,376 INFO Thread-12 :11193 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143449-7tyiihss/files/config.yaml
102
+ 2024-08-04 14:34:54,376 INFO Thread-12 :11193 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143449-7tyiihss/files/wandb-summary.json
103
+ 2024-08-04 14:34:54,858 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 14:34:56,041 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 14:34:56,041 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 14:34:56,041 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:34:56,042 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:34:56,042 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:34:56,042 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:34:56,042 INFO SenderThread:11193 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:34:56,043 INFO SenderThread:11193 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:34:56,056 INFO SenderThread:11193 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:34:56,064 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:34:56,065 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-04 14:34:56,065 DEBUG SenderThread:11193 [sender.py:send():382] send: artifact
116
+ 2024-08-04 14:34:56,065 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:34:56,380 INFO Thread-12 :11193 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143449-7tyiihss/files/output.log
118
+ 2024-08-04 14:34:56,858 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 14:34:56,895 INFO SenderThread:11193 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:34:56,895 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:34:56,895 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:34:56,895 INFO SenderThread:11193 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:34:57,381 INFO SenderThread:11193 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_143449-7tyiihss/files
124
+ 2024-08-04 14:34:57,382 INFO SenderThread:11193 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143449-7tyiihss/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:34:57,382 INFO SenderThread:11193 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143449-7tyiihss/files/config.yaml config.yaml
126
+ 2024-08-04 14:34:57,382 INFO SenderThread:11193 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143449-7tyiihss/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:34:57,384 INFO SenderThread:11193 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143449-7tyiihss/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:34:57,386 INFO SenderThread:11193 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143449-7tyiihss/files/output.log output.log
129
+ 2024-08-04 14:34:57,387 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:34:57,388 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:34:57,388 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:34:57,388 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:34:57,389 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:34:57,390 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:34:57,390 INFO SenderThread:11193 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:34:57,784 INFO wandb-upload_1:11193 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143449-7tyiihss/files/config.yaml
137
+ 2024-08-04 14:34:57,859 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: poll_exit
138
+ 2024-08-04 14:34:57,859 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: poll_exit
139
+ 2024-08-04 14:34:57,882 INFO wandb-upload_0:11193 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143449-7tyiihss/files/requirements.txt
140
+ 2024-08-04 14:34:57,946 INFO wandb-upload_3:11193 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143449-7tyiihss/files/output.log
141
+ 2024-08-04 14:34:57,948 INFO wandb-upload_2:11193 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143449-7tyiihss/files/wandb-summary.json
142
+ 2024-08-04 14:34:58,148 INFO Thread-11 (_thread_body):11193 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:34:58,149 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:34:58,149 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:34:58,149 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:34:58,149 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:34:58,149 INFO SenderThread:11193 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:34:58,149 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:34:58,150 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:34:58,150 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:34:58,150 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:34:58,150 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:34:58,150 INFO SenderThread:11193 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:34:58,337 INFO SenderThread:11193 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:34:58,337 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:34:58,337 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:34:58,337 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:34:58,338 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:34:58,338 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:34:58,338 INFO SenderThread:11193 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:34:58,338 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 14:34:58,338 DEBUG SenderThread:11193 [sender.py:send():382] send: final
163
+ 2024-08-04 14:34:58,338 DEBUG SenderThread:11193 [sender.py:send():382] send: footer
164
+ 2024-08-04 14:34:58,338 INFO HandlerThread:11193 [handler.py:handle_request_defer():172] handle defer: 14
165
+ 2024-08-04 14:34:58,339 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:34:58,339 INFO SenderThread:11193 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:34:58,339 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:34:58,339 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:34:58,340 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:34:58,340 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: server_info
171
+ 2024-08-04 14:34:58,340 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: poll_exit
172
+ 2024-08-04 14:34:58,340 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 14:34:58,340 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: server_info
174
+ 2024-08-04 14:34:58,342 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:34:58,342 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:34:58,342 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:34:58,505 DEBUG SenderThread:11193 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:34:58,506 INFO MainThread:11193 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:34:58,506 INFO MainThread:11193 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:34:58,506 INFO MainThread:11193 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:34:58,506 DEBUG HandlerThread:11193 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:34:58,506 INFO HandlerThread:11193 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:34:59,343 INFO WriterThread:11193 [datastore.py:close():296] close: /project/wandb/run-20240804_143449-7tyiihss/run-7tyiihss.wandb
184
+ 2024-08-04 14:34:59,506 INFO SenderThread:11193 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:34:59,506 INFO SenderThread:11193 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:34:59,506 INFO SenderThread:11193 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_143449-7tyiihss/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-04 14:34:49,898 INFO MainThread:11121 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Configure stats pid to 11121
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_143449-7tyiihss/logs/debug.log
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_143449-7tyiihss/logs/debug-internal.log
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:init():566] calling init triggers
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:34:38', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:init():616] starting backend
+ 2024-08-04 14:34:49,899 INFO MainThread:11121 [wandb_init.py:init():620] setting up manager
+ 2024-08-04 14:34:49,904 INFO MainThread:11121 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-04 14:34:49,905 INFO MainThread:11121 [wandb_init.py:init():628] backend started and connected
+ 2024-08-04 14:34:49,910 INFO MainThread:11121 [wandb_init.py:init():720] updated telemetry
+ 2024-08-04 14:34:49,920 INFO MainThread:11121 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-04 14:34:50,376 INFO MainThread:11121 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-04 14:34:50,461 INFO MainThread:11121 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-04 14:34:50,461 INFO MainThread:11121 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-04 14:34:50,520 INFO MainThread:11121 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-04 14:34:50,520 INFO MainThread:11121 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-04 14:34:50,521 INFO MainThread:11121 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-04 14:34:50,521 INFO MainThread:11121 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-04 14:34:50,521 INFO MainThread:11121 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-04 14:34:53,773 INFO MainThread:11121 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
+ 2024-08-04 14:34:53,774 INFO MainThread:11121 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-04 14:34:59,507 WARNING MsgRouterThr:11121 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_143449-7tyiihss/run-7tyiihss.wandb ADDED
Binary file (20.4 kB).
 
wandb/run-20240804_153511-5ba5jbt6/files/config.yaml ADDED
@@ -0,0 +1,335 @@
+ wandb_version: 1
+
+ sharding_strategy:
+ desc: null
+ value: FULL_SHARD
+ checkpoint_type:
+ desc: null
+ value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+ desc: null
+ value: true
+ fsdp_cpu_offload:
+ desc: null
+ value: false
+ low_cpu_fsdp:
+ desc: null
+ value: false
+ no_meta_device:
+ desc: null
+ value: false
+ data_path:
+ desc: null
+ value: null
+ split:
+ desc: null
+ value: 969, 30, 1
+ train_data_path:
+ desc: null
+ value:
+ - '4013541'
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ valid_data_path:
+ desc: null
+ value:
+ - '4013541'
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ test_data_path:
+ desc: null
+ value:
+ - '4013541'
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+ data_cache_path:
+ desc: null
+ value: null
+ vocab_size:
+ desc: null
+ value: null
+ vocab_file:
+ desc: null
+ value: null
+ merge_file:
+ desc: null
+ value: null
+ seq_length:
+ desc: null
+ value: 512
+ num_workers:
+ desc: null
+ value: 2
+ tokenizer_type:
+ desc: null
+ value: Llama2Tokenizer
+ tokenizer_model:
+ desc: null
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
+ reset_position_ids:
+ desc: null
+ value: false
+ reset_attention_mask:
+ desc: null
+ value: false
+ eod_mask_loss:
+ desc: null
+ value: false
+ retro_return_doc_ids:
+ desc: null
+ value: false
+ short_seq_prob:
+ desc: null
+ value: 0.1
+ vocab_extra_ids:
+ desc: null
+ value: 0
+ seed:
+ desc: null
+ value: 1234
+ use_mpi:
+ desc: null
+ value: false
+ wandb_entity:
+ desc: null
+ value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+ desc: null
+ value: tiny-llama_train_2024-08-04-15:34:59
+ wandb_project:
+ desc: null
+ value: llm_tutorial
+ quantization:
+ desc: null
+ value: false
+ use_freeze_layers:
+ desc: null
+ value: false
+ freeze_layers:
+ desc: null
+ value: null
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ mixed_precision:
+ desc: null
+ value: true
+ param_dtype:
+ desc: null
+ value: null
+ load:
+ desc: null
+ value: /work/llm_recipes/models/tiny-llama
+ save:
+ desc: null
+ value: /work/llm_recipes/models/tiny-llama
+ base_model:
+ desc: null
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+ use_better_transformer:
+ desc: null
+ value: false
+ grad_clip_norm:
+ desc: null
+ value: 1.0
+ eval_interval:
+ desc: null
+ value: 200
+ save_interval:
+ desc: null
+ value: 200
+ eval_iters:
+ desc: null
+ value: 10
+ optimizer:
+ desc: null
+ value: adam
+ lr:
+ desc: null
+ value: 2.0e-05
+ lr_decay_style:
+ desc: null
+ value: cosine
+ lr_decay_iters:
+ desc: null
+ value: 2000
+ lr_warmup_iters:
+ desc: null
+ value: 500
+ min_lr:
+ desc: null
+ value: 1.0e-06
+ train_iters:
+ desc: null
+ value: 2000
+ train_samples:
+ desc: null
+ value: null
+ global_batch_size:
+ desc: null
+ value: 320
+ micro_batch_size:
+ desc: null
+ value: 8
+ make_vocab_size_divisible_by:
+ desc: null
+ value: 128
+ sliding_window_size:
+ desc: null
+ value: 4096
+ skip_batch:
+ desc: null
+ value: null
+ no_save_optimizer_state:
+ desc: null
+ value: false
+ continual_pretraining:
+ desc: null
+ value: false
+ instruction_tuning:
+ desc: null
+ value: false
+ direct_preference_optimization:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.1
+ hidden_dropout:
+ desc: null
+ value: 0.1
+ weight_decay:
+ desc: null
+ value: 0.1
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.95
+ adam_eps:
+ desc: null
+ value: 1.0e-06
+ hf_transformer_model_dir:
+ desc: null
+ value: null
+ instruction_train_data_path:
+ desc: null
+ value: null
+ instruction_valid_data_path:
+ desc: null
+ value: null
+ epoch:
+ desc: null
+ value: null
+ instruction_dataset_size:
+ desc: null
+ value: null
+ save_sampler_state:
+ desc: null
+ value: false
+ label_smoothing:
+ desc: null
+ value: 0.0
+ save_n_checkpoints:
+ desc: null
+ value: 10
+ hf_repo_id:
+ desc: null
+ value: koichi12/tiny-llama
+ create_public_hf_repo:
+ desc: null
+ value: false
+ upload_all_checkpoints_to_hf:
+ desc: null
+ value: false
+ hf_upload_retry_limit:
+ desc: null
+ value: 2
+ exit_duration_in_mins:
+ desc: null
+ value: null
+ source_key:
+ desc: null
+ value: null
+ target_key:
+ desc: null
+ value: null
+ attn_implementation:
+ desc: null
+ value: flash_attention_2
+ efficient_instruction_tuning:
+ desc: null
+ value: false
+ remove_padding_masking:
+ desc: null
+ value: false
+ save_start_iter:
+ desc: null
+ value: null
+ rank:
+ desc: null
+ value: 0
+ world_size:
+ desc: null
+ value: 1
+ padded_vocab_size:
+ desc: null
+ value: 32000
+ gradient_accumulation_steps:
+ desc: null
+ value: 40
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.43.3
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1722753311.766293
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 2:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 3:
+ - 13
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 6: 4.43.3
+ 8:
+ - 5
+ 13: linux-x86_64
+ activation_function:
+ desc: null
+ value: silu
+ hidden_size:
+ desc: null
+ value: 2048
+ model_type:
+ desc: null
+ value: llama
+ max_position_embeddings:
+ desc: null
+ value: 2048
+ num_attention_heads:
+ desc: null
+ value: 32
+ num_hidden_layers:
+ desc: null
+ value: 22
+ model_architecture:
+ desc: null
+ value: LlamaForCausalLM
wandb/run-20240804_153511-5ba5jbt6/files/output.log ADDED
@@ -0,0 +1,135 @@
+ Created Hugging Face repository with ID koichi12/tiny-llama.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+     train:      640000
+     validation: 35200
+     test:       3200
+ > building train, validation, and test datasets for GPT ...
+ > finished creating GPT datasets ...
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+ model info: FullyShardedDataParallel(
+   (_fsdp_wrapped_module): LlamaForCausalLM(
+     (model): LlamaModel(
+       (embed_tokens): Embedding(32000, 2048)
+       (layers): ModuleList(
+         (0-21): 22 x FullyShardedDataParallel(
+           (_fsdp_wrapped_module): CheckpointWrapper(
+             (_checkpoint_wrapped_module): LlamaDecoderLayer(
+               (self_attn): LlamaFlashAttention2(
+                 (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+                 (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+                 (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (rotary_emb): LlamaRotaryEmbedding()
+               )
+               (mlp): LlamaMLP(
+                 (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                 (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                 (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): LlamaRMSNorm()
+               (post_attention_layernorm): LlamaRMSNorm()
+             )
+           )
+         )
+       )
+       (norm): LlamaRMSNorm()
+       (rotary_emb): LlamaRotaryEmbedding()
+     )
+     (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+   )
+ )
+ model config: LlamaConfig {
+   "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5632,
+   "label_smoothing": 0.0,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 22,
+   "num_key_value_heads": 4,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.43.3",
+   "use_cache": false,
+   "vocab_size": 32000
+ }
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+   warnings.warn(
+ Let split = None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Traceback (most recent call last):
+   File "/project/examples/finetuning.py", line 13, in <module>
+     main()
+   File "/project/src/llama_recipes/finetuning.py", line 281, in main
+     train(
+   File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
+     batch = next(train_dataloader)
+   File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
+     for x in iter:
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
+     data = self._next_data()
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
+     return self._process_data(data)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
+     data.reraise()
+   File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
+     raise exception
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
+ Original Traceback (most recent call last):
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
+     data = fetcher.fetch(index)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+     return self.collate_fn(data)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
+     return collate(batch, collate_fn_map=default_collate_fn_map)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
+     return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
+     return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
+     return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
+   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
+     return torch.stack(batch, 0, out=out)
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
wandb/run-20240804_153511-5ba5jbt6/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240804_153511-5ba5jbt6/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+     "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+     "python": "3.10.12",
+     "heartbeatAt": "2024-08-04T06:35:12.365765",
+     "startedAt": "2024-08-04T06:35:11.753150",
+     "docker": null,
+     "cuda": null,
+     "args": [
+         "--seq-length",
+         "512",
+         "--sliding-window-size",
+         "4096",
+         "--micro-batch-size",
+         "8",
+         "--global-batch-size",
+         "320",
+         "--train-iters",
+         "2000",
+         "--tokenizer-type",
+         "Llama2Tokenizer",
+         "--tokenizer-model",
+         "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+         "--train-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--valid-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--test-data-path",
+         "4013541",
+         "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+         "--lr",
+         "2e-5",
+         "--min-lr",
+         "1e-6",
+         "--lr-decay-style",
+         "cosine",
+         "--lr-warmup-iters",
+         "500",
+         "--lr-decay-iters",
+         "2000",
+         "--weight-decay",
+         "0.1",
+         "--grad-clip-norm",
+         "1.0",
+         "--optimizer",
+         "adam",
+         "--adam-beta1",
+         "0.9",
+         "--adam-beta2",
+         "0.95",
+         "--adam-eps",
+         "1e-6",
+         "--save-interval",
+         "200",
+         "--eval-interval",
+         "200",
+         "--eval-iters",
+         "10",
+         "--bf16",
+         "--mixed-precision",
+         "--base-model",
+         "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+         "--save",
+         "/work/llm_recipes/models/tiny-llama",
+         "--load",
+         "/work/llm_recipes/models/tiny-llama",
+         "--fsdp-activation-checkpointing",
+         "--sharding-strategy",
+         "FULL_SHARD",
+         "--checkpoint-type",
+         "LOCAL_STATE_DICT",
+         "--save-n-checkpoints",
+         "10",
+         "--hf-upload-retry-limit",
+         "2",
+         "--hf-repo-id",
+         "koichi12/tiny-llama",
+         "--wandb-entity",
+         "iwakawa-koichi-q5-tohoku-nlp6723",
+         "--wandb-project",
+         "llm_tutorial",
+         "--wandb-name",
+         "tiny-llama_train_2024-08-04-15:34:59"
+     ],
+     "state": "running",
+     "program": "/project/examples/finetuning.py",
+     "codePathLocal": "examples/finetuning.py",
+     "codePath": "examples/finetuning.py",
+     "git": {
+         "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+         "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+     },
+     "email": null,
+     "root": "/project",
+     "host": "gpu-koiwa-00",
+     "username": "koiwa",
+     "executable": "/usr/bin/python",
+     "cpu_count": 18,
+     "cpu_count_logical": 18,
+     "cpu_freq": {
+         "current": 2400.034,
+         "min": 0.0,
+         "max": 0.0
+     },
+     "cpu_freq_per_core": [
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         },
+         {
+             "current": 2400.034,
+             "min": 0.0,
+             "max": 0.0
+         }
+     ],
+     "disk": {
+         "/": {
+             "total": 0.0625,
+             "used": 1.1444091796875e-05
+         }
+     },
+     "gpu": "NVIDIA A100-SXM4-40GB",
+     "gpu_count": 1,
+     "gpu_devices": [
+         {
+             "name": "NVIDIA A100-SXM4-40GB",
+             "memory_total": 42949672960
+         }
+     ],
+     "memory": {
+         "total": 56.48781967163086
+     }
+ }
wandb/run-20240804_153511-5ba5jbt6/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 3}}
wandb/run-20240804_153511-5ba5jbt6/logs/debug-internal.log ADDED
@@ -0,0 +1,188 @@
+ 2024-08-04 15:35:11,766 INFO StreamThr :10035 [internal.py:wandb_internal():86] W&B internal server running at pid: 10035, started at: 2024-08-04 15:35:11.765926
+ 2024-08-04 15:35:11,768 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: status
+ 2024-08-04 15:35:11,771 INFO WriterThread:10035 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_153511-5ba5jbt6/run-5ba5jbt6.wandb
+ 2024-08-04 15:35:11,772 DEBUG SenderThread:10035 [sender.py:send():382] send: header
+ 2024-08-04 15:35:11,786 DEBUG SenderThread:10035 [sender.py:send():382] send: run
+ 2024-08-04 15:35:12,256 INFO SenderThread:10035 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_153511-5ba5jbt6/files
+ 2024-08-04 15:35:12,256 INFO SenderThread:10035 [sender.py:_start_run_threads():1136] run started: 5ba5jbt6 with start time 1722753311.766293
+ 2024-08-04 15:35:12,259 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: check_version
+ 2024-08-04 15:35:12,260 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: check_version
+ 2024-08-04 15:35:12,346 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: run_start
+ 2024-08-04 15:35:12,352 DEBUG HandlerThread:10035 [system_info.py:__init__():27] System info init
+ 2024-08-04 15:35:12,352 DEBUG HandlerThread:10035 [system_info.py:__init__():42] System info init done
+ 2024-08-04 15:35:12,352 INFO HandlerThread:10035 [system_monitor.py:start():194] Starting system monitor
+ 2024-08-04 15:35:12,352 INFO SystemMonitor:10035 [system_monitor.py:_start():158] Starting system asset monitoring threads
+ 2024-08-04 15:35:12,352 INFO HandlerThread:10035 [system_monitor.py:probe():214] Collecting system info
+ 2024-08-04 15:35:12,353 INFO SystemMonitor:10035 [interfaces.py:start():190] Started cpu monitoring
+ 2024-08-04 15:35:12,353 INFO SystemMonitor:10035 [interfaces.py:start():190] Started disk monitoring
+ 2024-08-04 15:35:12,354 INFO SystemMonitor:10035 [interfaces.py:start():190] Started gpu monitoring
+ 2024-08-04 15:35:12,354 INFO SystemMonitor:10035 [interfaces.py:start():190] Started memory monitoring
+ 2024-08-04 15:35:12,354 INFO SystemMonitor:10035 [interfaces.py:start():190] Started network monitoring
+ 2024-08-04 15:35:12,365 DEBUG HandlerThread:10035 [system_info.py:probe():151] Probing system
+ 2024-08-04 15:35:12,367 DEBUG HandlerThread:10035 [system_info.py:_probe_git():136] Probing git
+ 2024-08-04 15:35:12,379 DEBUG HandlerThread:10035 [system_info.py:_probe_git():144] Probing git done
+ 2024-08-04 15:35:12,379 DEBUG HandlerThread:10035 [system_info.py:probe():199] Probing system done
+ 2024-08-04 15:35:12,379 DEBUG HandlerThread:10035 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T06:35:12.365765', 'startedAt': '2024-08-04T06:35:11.753150', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-15:34:59'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
+ 2024-08-04 15:35:12,379 INFO HandlerThread:10035 [system_monitor.py:probe():224] Finished collecting system info
+ 2024-08-04 15:35:12,379 INFO HandlerThread:10035 [system_monitor.py:probe():227] Publishing system info
+ 2024-08-04 15:35:12,380 INFO HandlerThread:10035 [system_monitor.py:probe():229] Finished publishing system info
+ 2024-08-04 15:35:12,392 DEBUG SenderThread:10035 [sender.py:send():382] send: files
+ 2024-08-04 15:35:12,392 INFO SenderThread:10035 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+ 2024-08-04 15:35:12,401 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: python_packages
+ 2024-08-04 15:35:12,401 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-04 15:35:12,401 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: python_packages
+ 2024-08-04 15:35:12,402 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-04 15:35:12,403 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-04 15:35:12,635 DEBUG SenderThread:10035 [sender.py:send():382] send: telemetry
+ 2024-08-04 15:35:13,069 INFO wandb-upload_0:10035 [upload_job.py:push():131] Uploaded file /tmp/tmpekww83l_wandb/2um60osn-wandb-metadata.json
+ 2024-08-04 15:35:13,258 INFO Thread-12 :10035 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_153511-5ba5jbt6/files/requirements.txt
+ 2024-08-04 15:35:13,258 INFO Thread-12 :10035 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
+ 2024-08-04 15:35:13,259 INFO Thread-12 :10035 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_153511-5ba5jbt6/files/wandb-metadata.json
+ 2024-08-04 15:35:15,255 DEBUG SenderThread:10035 [sender.py:send():382] send: config
+ 2024-08-04 15:35:15,255 DEBUG SenderThread:10035 [sender.py:send():382] send: config
+ 2024-08-04 15:35:15,259 INFO Thread-12 :10035 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
+ 2024-08-04 15:35:15,561 DEBUG SenderThread:10035 [sender.py:send():382] send: exit
+ 2024-08-04 15:35:15,561 INFO SenderThread:10035 [sender.py:send_exit():589] handling exit code: 1
+ 2024-08-04 15:35:15,561 INFO SenderThread:10035 [sender.py:send_exit():591] handling runtime: 3
+ 2024-08-04 15:35:15,562 INFO SenderThread:10035 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-04 15:35:15,563 INFO SenderThread:10035 [sender.py:send_exit():597] send defer
+ 2024-08-04 15:35:15,563 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,563 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 0
+ 2024-08-04 15:35:15,563 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,563 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 0
+ 2024-08-04 15:35:15,563 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 1
+ 2024-08-04 15:35:15,563 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,563 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 1
+ 2024-08-04 15:35:15,563 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,563 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 1
+ 2024-08-04 15:35:15,564 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 2
+ 2024-08-04 15:35:15,564 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,564 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 2
+ 2024-08-04 15:35:15,564 INFO HandlerThread:10035 [system_monitor.py:finish():203] Stopping system monitor
+ 2024-08-04 15:35:15,564 INFO HandlerThread:10035 [interfaces.py:finish():202] Joined cpu monitor
+ 2024-08-04 15:35:15,564 DEBUG SystemMonitor:10035 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+ 2024-08-04 15:35:15,564 INFO HandlerThread:10035 [interfaces.py:finish():202] Joined disk monitor
+ 2024-08-04 15:35:15,564 DEBUG SystemMonitor:10035 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+ 2024-08-04 15:35:15,565 DEBUG SystemMonitor:10035 [system_monitor.py:_start():183] Publishing last batch of metrics
+ 2024-08-04 15:35:15,597 INFO HandlerThread:10035 [interfaces.py:finish():202] Joined gpu monitor
+ 2024-08-04 15:35:15,597 INFO HandlerThread:10035 [interfaces.py:finish():202] Joined memory monitor
+ 2024-08-04 15:35:15,597 INFO HandlerThread:10035 [interfaces.py:finish():202] Joined network monitor
+ 2024-08-04 15:35:15,598 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,598 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 2
+ 2024-08-04 15:35:15,598 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 3
+ 2024-08-04 15:35:15,598 DEBUG SenderThread:10035 [sender.py:send():382] send: stats
+ 2024-08-04 15:35:15,598 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,598 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 3
+ 2024-08-04 15:35:15,598 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,598 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 3
+ 2024-08-04 15:35:15,598 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 4
+ 2024-08-04 15:35:15,598 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,599 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 4
+ 2024-08-04 15:35:15,599 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,599 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 4
+ 2024-08-04 15:35:15,599 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 5
+ 2024-08-04 15:35:15,599 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,599 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 5
+ 2024-08-04 15:35:15,599 DEBUG SenderThread:10035 [sender.py:send():382] send: summary
+ 2024-08-04 15:35:15,600 INFO SenderThread:10035 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-04 15:35:15,600 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,600 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 5
+ 2024-08-04 15:35:15,600 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 6
+ 2024-08-04 15:35:15,600 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-04 15:35:15,600 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 6
+ 2024-08-04 15:35:15,601 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
+ 2024-08-04 15:35:15,601 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 6
+ 2024-08-04 15:35:15,603 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-04 15:35:15,791 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 7
97
+ 2024-08-04 15:35:15,791 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
98
+ 2024-08-04 15:35:15,791 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 7
99
+ 2024-08-04 15:35:15,791 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
100
+ 2024-08-04 15:35:15,791 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 7
101
+ 2024-08-04 15:35:16,260 INFO Thread-12 :10035 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
102
+ 2024-08-04 15:35:16,260 INFO Thread-12 :10035 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_153511-5ba5jbt6/files/config.yaml
103
+ 2024-08-04 15:35:16,260 INFO Thread-12 :10035 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_153511-5ba5jbt6/files/wandb-summary.json
104
+ 2024-08-04 15:35:16,561 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: poll_exit
105
+ 2024-08-04 15:35:17,260 INFO Thread-12 :10035 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
106
+ 2024-08-04 15:35:17,299 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 8
107
+ 2024-08-04 15:35:17,299 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: poll_exit
108
+ 2024-08-04 15:35:17,299 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
109
+ 2024-08-04 15:35:17,299 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 8
110
+ 2024-08-04 15:35:17,299 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
111
+ 2024-08-04 15:35:17,299 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 8
112
+ 2024-08-04 15:35:17,299 INFO SenderThread:10035 [job_builder.py:build():296] Attempting to build job artifact
113
+ 2024-08-04 15:35:17,300 INFO SenderThread:10035 [job_builder.py:_get_source_type():426] is repo sourced job
114
+ 2024-08-04 15:35:17,314 INFO SenderThread:10035 [job_builder.py:build():402] adding wandb-job metadata file
115
+ 2024-08-04 15:35:17,322 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 9
116
+ 2024-08-04 15:35:17,322 DEBUG SenderThread:10035 [sender.py:send():382] send: artifact
117
+ 2024-08-04 15:35:17,322 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
118
+ 2024-08-04 15:35:17,323 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 9
119
+ 2024-08-04 15:35:17,561 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: poll_exit
120
+ 2024-08-04 15:35:18,177 INFO SenderThread:10035 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
121
+ 2024-08-04 15:35:18,177 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
122
+ 2024-08-04 15:35:18,177 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 9
123
+ 2024-08-04 15:35:18,177 INFO SenderThread:10035 [dir_watcher.py:finish():358] shutting down directory watcher
124
+ 2024-08-04 15:35:18,261 INFO SenderThread:10035 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
125
+ 2024-08-04 15:35:18,261 INFO SenderThread:10035 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_153511-5ba5jbt6/files
126
+ 2024-08-04 15:35:18,262 INFO SenderThread:10035 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_153511-5ba5jbt6/files/requirements.txt requirements.txt
127
+ 2024-08-04 15:35:18,262 INFO SenderThread:10035 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_153511-5ba5jbt6/files/config.yaml config.yaml
128
+ 2024-08-04 15:35:18,263 INFO SenderThread:10035 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_153511-5ba5jbt6/files/wandb-metadata.json wandb-metadata.json
129
+ 2024-08-04 15:35:18,263 INFO SenderThread:10035 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_153511-5ba5jbt6/files/wandb-summary.json wandb-summary.json
130
+ 2024-08-04 15:35:18,265 INFO SenderThread:10035 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log output.log
131
+ 2024-08-04 15:35:18,266 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 10
132
+ 2024-08-04 15:35:18,267 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: poll_exit
133
+ 2024-08-04 15:35:18,267 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
134
+ 2024-08-04 15:35:18,268 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 10
135
+ 2024-08-04 15:35:18,268 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
136
+ 2024-08-04 15:35:18,268 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 10
137
+ 2024-08-04 15:35:18,268 INFO SenderThread:10035 [file_pusher.py:finish():172] shutting down file pusher
138
+ 2024-08-04 15:35:18,562 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: poll_exit
139
+ 2024-08-04 15:35:18,562 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: poll_exit
140
+ 2024-08-04 15:35:18,679 INFO wandb-upload_0:10035 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_153511-5ba5jbt6/files/requirements.txt
141
+ 2024-08-04 15:35:18,797 INFO wandb-upload_1:10035 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_153511-5ba5jbt6/files/config.yaml
142
+ 2024-08-04 15:35:18,860 INFO wandb-upload_2:10035 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_153511-5ba5jbt6/files/wandb-summary.json
143
+ 2024-08-04 15:35:18,877 INFO wandb-upload_3:10035 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_153511-5ba5jbt6/files/output.log
144
+ 2024-08-04 15:35:19,077 INFO Thread-11 (_thread_body):10035 [sender.py:transition_state():617] send defer: 11
145
+ 2024-08-04 15:35:19,077 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
146
+ 2024-08-04 15:35:19,078 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 11
147
+ 2024-08-04 15:35:19,078 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
148
+ 2024-08-04 15:35:19,078 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 11
149
+ 2024-08-04 15:35:19,078 INFO SenderThread:10035 [file_pusher.py:join():178] waiting for file pusher
150
+ 2024-08-04 15:35:19,078 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 12
151
+ 2024-08-04 15:35:19,078 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
152
+ 2024-08-04 15:35:19,078 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 12
153
+ 2024-08-04 15:35:19,078 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
154
+ 2024-08-04 15:35:19,078 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 12
155
+ 2024-08-04 15:35:19,078 INFO SenderThread:10035 [file_stream.py:finish():595] file stream finish called
156
+ 2024-08-04 15:35:19,260 INFO SenderThread:10035 [file_stream.py:finish():599] file stream finish is done
157
+ 2024-08-04 15:35:19,260 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 13
158
+ 2024-08-04 15:35:19,261 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
159
+ 2024-08-04 15:35:19,261 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 13
160
+ 2024-08-04 15:35:19,261 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
161
+ 2024-08-04 15:35:19,261 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 13
162
+ 2024-08-04 15:35:19,261 INFO SenderThread:10035 [sender.py:transition_state():617] send defer: 14
163
+ 2024-08-04 15:35:19,261 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: defer
164
+ 2024-08-04 15:35:19,261 DEBUG SenderThread:10035 [sender.py:send():382] send: final
165
+ 2024-08-04 15:35:19,261 INFO HandlerThread:10035 [handler.py:handle_request_defer():172] handle defer: 14
166
+ 2024-08-04 15:35:19,261 DEBUG SenderThread:10035 [sender.py:send():382] send: footer
167
+ 2024-08-04 15:35:19,262 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: defer
168
+ 2024-08-04 15:35:19,262 INFO SenderThread:10035 [sender.py:send_request_defer():613] handle sender defer: 14
169
+ 2024-08-04 15:35:19,262 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 15:35:19,262 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 15:35:19,262 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: poll_exit
172
+ 2024-08-04 15:35:19,263 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: server_info
173
+ 2024-08-04 15:35:19,263 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: poll_exit
174
+ 2024-08-04 15:35:19,263 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: server_info
175
+ 2024-08-04 15:35:19,263 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: get_summary
176
+ 2024-08-04 15:35:19,264 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: sampled_history
177
+ 2024-08-04 15:35:19,265 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: internal_messages
178
+ 2024-08-04 15:35:19,265 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: job_info
179
+ 2024-08-04 15:35:19,429 DEBUG SenderThread:10035 [sender.py:send_request():409] send_request: job_info
180
+ 2024-08-04 15:35:19,429 INFO MainThread:10035 [wandb_run.py:_footer_history_summary_info():3866] rendering history
181
+ 2024-08-04 15:35:19,429 INFO MainThread:10035 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
182
+ 2024-08-04 15:35:19,429 INFO MainThread:10035 [wandb_run.py:_footer_sync_info():3825] logging synced files
183
+ 2024-08-04 15:35:19,429 DEBUG HandlerThread:10035 [handler.py:handle_request():146] handle_request: shutdown
184
+ 2024-08-04 15:35:19,429 INFO HandlerThread:10035 [handler.py:finish():869] shutting down handler
185
+ 2024-08-04 15:35:20,265 INFO WriterThread:10035 [datastore.py:close():296] close: /project/wandb/run-20240804_153511-5ba5jbt6/run-5ba5jbt6.wandb
186
+ 2024-08-04 15:35:20,429 INFO SenderThread:10035 [sender.py:finish():1572] shutting down sender
187
+ 2024-08-04 15:35:20,429 INFO SenderThread:10035 [file_pusher.py:finish():172] shutting down file pusher
188
+ 2024-08-04 15:35:20,429 INFO SenderThread:10035 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_153511-5ba5jbt6/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-04 15:35:11,758 INFO MainThread:9964 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Configure stats pid to 9964
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_153511-5ba5jbt6/logs/debug.log
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_153511-5ba5jbt6/logs/debug-internal.log
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:init():566] calling init triggers
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-15:34:59', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:init():616] starting backend
+ 2024-08-04 15:35:11,759 INFO MainThread:9964 [wandb_init.py:init():620] setting up manager
+ 2024-08-04 15:35:11,764 INFO MainThread:9964 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-04 15:35:11,766 INFO MainThread:9964 [wandb_init.py:init():628] backend started and connected
+ 2024-08-04 15:35:11,770 INFO MainThread:9964 [wandb_init.py:init():720] updated telemetry
+ 2024-08-04 15:35:11,782 INFO MainThread:9964 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-04 15:35:12,259 INFO MainThread:9964 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-04 15:35:12,339 INFO MainThread:9964 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-04 15:35:12,339 INFO MainThread:9964 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-04 15:35:12,400 INFO MainThread:9964 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-04 15:35:12,400 INFO MainThread:9964 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-04 15:35:12,400 INFO MainThread:9964 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-04 15:35:12,400 INFO MainThread:9964 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-04 15:35:12,401 INFO MainThread:9964 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-04 15:35:15,253 INFO MainThread:9964 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
+ 2024-08-04 15:35:15,253 INFO MainThread:9964 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-04 15:35:20,430 WARNING MsgRouterThr:9964 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_153511-5ba5jbt6/run-5ba5jbt6.wandb ADDED
Binary file (20.4 kB).
wandb/run-20240812_052446-qrv0d6sp/files/config.yaml ADDED
@@ -0,0 +1,314 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-05:24:35
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 5
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12//yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723407886.294165
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
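The batch-size fields in this config are internally consistent: with `world_size: 1` and `micro_batch_size: 1`, reaching `global_batch_size: 320` requires 320 gradient-accumulation steps, which matches the recorded `gradient_accumulation_steps: 320` (and, in the earlier tiny-llama run, 320 / (8 × 1) = 40). A minimal sketch of that relation, with a helper name that is illustrative rather than taken from the training code:

```python
def gradient_accumulation_steps(global_batch_size: int,
                                micro_batch_size: int,
                                world_size: int) -> int:
    """Steps needed so micro_batch_size * world_size * steps == global_batch_size."""
    per_step = micro_batch_size * world_size
    if global_batch_size % per_step != 0:
        raise ValueError(
            "global_batch_size must be divisible by micro_batch_size * world_size")
    return global_batch_size // per_step

# Values from this run's config.yaml: 320 / (1 * 1)
print(gradient_accumulation_steps(320, 1, 1))  # → 320
```

The same check reproduces the tiny-llama run's value: `gradient_accumulation_steps(320, 8, 1)` gives 40.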
wandb/run-20240812_052446-qrv0d6sp/files/output.log ADDED
@@ -0,0 +1,12 @@
+ Traceback (most recent call last):
+   File "/project/examples/finetuning.py", line 13, in <module>
+     main()
+   File "/project/src/llama_recipes/finetuning.py", line 85, in main
+     setup_huggingface_repository(args)
+   File "/project/src/llama_recipes/utils/hf_hub_utils.py", line 10, in setup_huggingface_repository
+     create_repo(
+   File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
+     validate_repo_id(arg_value)
+   File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
+     raise HFValidationError(
+ huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'koichi12//yans-qwen2-0.5B'. Use `repo_type` argument if needed.
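This run aborted because the configured `hf_repo_id` contained a doubled slash (`koichi12//yans-qwen2-0.5B`), which `huggingface_hub`'s repo-id validation rejects: only `repo_name` or `namespace/repo_name` is accepted. A hedged sketch of a guard one could run on the id before calling `create_repo` (the helper name is hypothetical, not part of the project's code):

```python
def normalize_repo_id(raw: str) -> str:
    """Collapse empty path segments so 'ns//name' becomes 'ns/name'.

    huggingface_hub requires repo ids of the form 'repo_name' or
    'namespace/repo_name'; a doubled slash fails validation as seen above.
    """
    parts = [p for p in raw.split("/") if p]
    if not 1 <= len(parts) <= 2:
        raise ValueError(f"cannot recover a valid repo id from {raw!r}")
    return "/".join(parts)

print(normalize_repo_id("koichi12//yans-qwen2-0.5B"))  # → koichi12/yans-qwen2-0.5B
```

With the id normalized this way, the `create_repo(...)` call in the traceback would receive a validly formed `namespace/repo_name` string.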
wandb/run-20240812_052446-qrv0d6sp/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240812_052446-qrv0d6sp/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-08-11T20:24:46.917714",
+ "startedAt": "2024-08-11T20:24:46.281353",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--seq-length",
+ "4096",
+ "--sliding-window-size",
+ "4096",
+ "--micro-batch-size",
+ "1",
+ "--global-batch-size",
+ "320",
+ "--train-iters",
+ "20000",
+ "--tokenizer-type",
+ "HFPreTrainedTokenizer",
+ "--tokenizer-model",
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+ "--train-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--valid-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--test-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--lr",
+ "2e-5",
+ "--min-lr",
+ "1e-6",
+ "--lr-decay-style",
+ "cosine",
+ "--lr-warmup-iters",
+ "500",
+ "--lr-decay-iters",
+ "20000",
+ "--weight-decay",
+ "0.1",
+ "--grad-clip-norm",
+ "1.0",
+ "--optimizer",
+ "adam",
+ "--adam-beta1",
+ "0.9",
+ "--adam-beta2",
+ "0.95",
+ "--adam-eps",
+ "1e-6",
+ "--save-interval",
+ "5",
+ "--eval-interval",
+ "200",
+ "--eval-iters",
+ "10",
+ "--bf16",
+ "--mixed-precision",
+ "--base-model",
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+ "--save",
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
+ "--load",
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
+ "--fsdp-activation-checkpointing",
+ "--sharding-strategy",
+ "FULL_SHARD",
+ "--checkpoint-type",
+ "LOCAL_STATE_DICT",
+ "--save-n-checkpoints",
+ "10",
+ "--hf-upload-retry-limit",
+ "2",
+ "--hf-repo-id",
+ "koichi12//yans-qwen2-0.5B",
+ "--wandb-entity",
+ "iwakawa-koichi-q5-tohoku-nlp6723",
+ "--wandb-project",
+ "llm_tutorial",
+ "--wandb-name",
+ "yans-qwen2-0.5B_train_2024-08-12-05:24:35"
+ ],
+ "state": "running",
+ "program": "/project/examples/finetuning.py",
+ "codePathLocal": "examples/finetuning.py",
+ "codePath": "examples/finetuning.py",
+ "git": {
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
+ },
+ "email": null,
+ "root": "/project",
+ "host": "gpu-koiwa-00",
+ "username": "koiwa",
+ "executable": "/usr/bin/python",
+ "cpu_count": 18,
+ "cpu_count_logical": 18,
+ "cpu_freq": {
+ "current": 2400.0429999999997,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 0.0625,
+ "used": 1.1444091796875e-05
+ }
+ },
+ "gpu": "NVIDIA A100-SXM4-40GB",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100-SXM4-40GB",
+ "memory_total": 42949672960
+ }
+ ],
+ "memory": {
+ "total": 56.487823486328125
+ }
+ }
wandb/run-20240812_052446-qrv0d6sp/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 0}}
wandb/run-20240812_052446-qrv0d6sp/logs/debug-internal.log ADDED
@@ -0,0 +1,185 @@
+ 2024-08-12 05:24:46,295 INFO StreamThr :10279 [internal.py:wandb_internal():86] W&B internal server running at pid: 10279, started at: 2024-08-12 05:24:46.294899
+ 2024-08-12 05:24:46,297 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: status
+ 2024-08-12 05:24:46,299 INFO WriterThread:10279 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_052446-qrv0d6sp/run-qrv0d6sp.wandb
+ 2024-08-12 05:24:46,300 DEBUG SenderThread:10279 [sender.py:send():382] send: header
+ 2024-08-12 05:24:46,314 DEBUG SenderThread:10279 [sender.py:send():382] send: run
+ 2024-08-12 05:24:46,803 INFO SenderThread:10279 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_052446-qrv0d6sp/files
+ 2024-08-12 05:24:46,803 INFO SenderThread:10279 [sender.py:_start_run_threads():1136] run started: qrv0d6sp with start time 1723407886.294165
+ 2024-08-12 05:24:46,808 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: check_version
+ 2024-08-12 05:24:46,809 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: check_version
+ 2024-08-12 05:24:46,897 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: run_start
+ 2024-08-12 05:24:46,903 DEBUG HandlerThread:10279 [system_info.py:__init__():27] System info init
+ 2024-08-12 05:24:46,903 DEBUG HandlerThread:10279 [system_info.py:__init__():42] System info init done
+ 2024-08-12 05:24:46,903 INFO HandlerThread:10279 [system_monitor.py:start():194] Starting system monitor
+ 2024-08-12 05:24:46,903 INFO SystemMonitor:10279 [system_monitor.py:_start():158] Starting system asset monitoring threads
+ 2024-08-12 05:24:46,904 INFO HandlerThread:10279 [system_monitor.py:probe():214] Collecting system info
+ 2024-08-12 05:24:46,904 INFO SystemMonitor:10279 [interfaces.py:start():190] Started cpu monitoring
+ 2024-08-12 05:24:46,904 INFO SystemMonitor:10279 [interfaces.py:start():190] Started disk monitoring
+ 2024-08-12 05:24:46,905 INFO SystemMonitor:10279 [interfaces.py:start():190] Started gpu monitoring
+ 2024-08-12 05:24:46,906 INFO SystemMonitor:10279 [interfaces.py:start():190] Started memory monitoring
+ 2024-08-12 05:24:46,907 INFO SystemMonitor:10279 [interfaces.py:start():190] Started network monitoring
+ 2024-08-12 05:24:46,917 DEBUG HandlerThread:10279 [system_info.py:probe():151] Probing system
+ 2024-08-12 05:24:46,919 DEBUG HandlerThread:10279 [system_info.py:_probe_git():136] Probing git
+ 2024-08-12 05:24:46,932 DEBUG HandlerThread:10279 [system_info.py:_probe_git():144] Probing git done
+ 2024-08-12 05:24:46,932 DEBUG HandlerThread:10279 [system_info.py:probe():199] Probing system done
+ 2024-08-12 05:24:46,932 DEBUG HandlerThread:10279 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:24:46.917714', 'startedAt': '2024-08-11T20:24:46.281353', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12//yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-05:24:35'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
+ 2024-08-12 05:24:46,932 INFO HandlerThread:10279 [system_monitor.py:probe():224] Finished collecting system info
+ 2024-08-12 05:24:46,932 INFO HandlerThread:10279 [system_monitor.py:probe():227] Publishing system info
+ 2024-08-12 05:24:46,934 INFO HandlerThread:10279 [system_monitor.py:probe():229] Finished publishing system info
+ 2024-08-12 05:24:46,940 DEBUG SenderThread:10279 [sender.py:send():382] send: files
+ 2024-08-12 05:24:46,940 INFO SenderThread:10279 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+ 2024-08-12 05:24:46,949 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: python_packages
+ 2024-08-12 05:24:46,949 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 05:24:46,950 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: python_packages
+ 2024-08-12 05:24:46,950 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 05:24:46,951 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 05:24:47,180 DEBUG SenderThread:10279 [sender.py:send():382] send: telemetry
+ 2024-08-12 05:24:47,182 DEBUG SenderThread:10279 [sender.py:send():382] send: exit
+ 2024-08-12 05:24:47,182 INFO SenderThread:10279 [sender.py:send_exit():589] handling exit code: 1
+ 2024-08-12 05:24:47,182 INFO SenderThread:10279 [sender.py:send_exit():591] handling runtime: 0
+ 2024-08-12 05:24:47,183 INFO SenderThread:10279 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 05:24:47,184 INFO SenderThread:10279 [sender.py:send_exit():597] send defer
+ 2024-08-12 05:24:47,184 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,184 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 0
+ 2024-08-12 05:24:47,184 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,184 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 0
+ 2024-08-12 05:24:47,184 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 1
+ 2024-08-12 05:24:47,184 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,184 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 1
+ 2024-08-12 05:24:47,184 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,184 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 1
+ 2024-08-12 05:24:47,184 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 2
+ 2024-08-12 05:24:47,184 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,185 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 2
+ 2024-08-12 05:24:47,185 INFO HandlerThread:10279 [system_monitor.py:finish():203] Stopping system monitor
+ 2024-08-12 05:24:47,185 DEBUG SystemMonitor:10279 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+ 2024-08-12 05:24:47,185 INFO HandlerThread:10279 [interfaces.py:finish():202] Joined cpu monitor
+ 2024-08-12 05:24:47,185 DEBUG SystemMonitor:10279 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+ 2024-08-12 05:24:47,185 INFO HandlerThread:10279 [interfaces.py:finish():202] Joined disk monitor
+ 2024-08-12 05:24:47,185 DEBUG SystemMonitor:10279 [system_monitor.py:_start():183] Publishing last batch of metrics
+ 2024-08-12 05:24:47,218 INFO HandlerThread:10279 [interfaces.py:finish():202] Joined gpu monitor
+ 2024-08-12 05:24:47,218 INFO HandlerThread:10279 [interfaces.py:finish():202] Joined memory monitor
+ 2024-08-12 05:24:47,218 INFO HandlerThread:10279 [interfaces.py:finish():202] Joined network monitor
+ 2024-08-12 05:24:47,219 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,219 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 2
+ 2024-08-12 05:24:47,219 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 3
+ 2024-08-12 05:24:47,219 DEBUG SenderThread:10279 [sender.py:send():382] send: stats
+ 2024-08-12 05:24:47,219 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,219 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 3
+ 2024-08-12 05:24:47,219 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,219 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 3
+ 2024-08-12 05:24:47,219 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 4
+ 2024-08-12 05:24:47,219 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,219 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 4
+ 2024-08-12 05:24:47,220 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,220 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 4
+ 2024-08-12 05:24:47,220 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 5
+ 2024-08-12 05:24:47,220 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,220 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 5
+ 2024-08-12 05:24:47,220 DEBUG SenderThread:10279 [sender.py:send():382] send: summary
+ 2024-08-12 05:24:47,221 INFO SenderThread:10279 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 05:24:47,221 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,221 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 5
+ 2024-08-12 05:24:47,221 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 6
+ 2024-08-12 05:24:47,221 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,221 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 6
+ 2024-08-12 05:24:47,221 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,222 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 6
+ 2024-08-12 05:24:47,224 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 05:24:47,422 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 7
+ 2024-08-12 05:24:47,422 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:47,422 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 7
+ 2024-08-12 05:24:47,422 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:47,422 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 7
+ 2024-08-12 05:24:47,581 INFO wandb-upload_0:10279 [upload_job.py:push():131] Uploaded file /tmp/tmppaigcwc7wandb/d7sbkpsh-wandb-metadata.json
+ 2024-08-12 05:24:47,805 INFO Thread-12 :10279 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052446-qrv0d6sp/files/config.yaml
+ 2024-08-12 05:24:47,805 INFO Thread-12 :10279 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052446-qrv0d6sp/files/requirements.txt
+ 2024-08-12 05:24:47,806 INFO Thread-12 :10279 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052446-qrv0d6sp/files/output.log
+ 2024-08-12 05:24:47,806 INFO Thread-12 :10279 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052446-qrv0d6sp/files/wandb-metadata.json
+ 2024-08-12 05:24:47,806 INFO Thread-12 :10279 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052446-qrv0d6sp/files/wandb-summary.json
+ 2024-08-12 05:24:47,995 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:49,187 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 8
+ 2024-08-12 05:24:49,187 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:49,187 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:49,187 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 8
+ 2024-08-12 05:24:49,188 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:49,188 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 8
+ 2024-08-12 05:24:49,188 INFO SenderThread:10279 [job_builder.py:build():296] Attempting to build job artifact
+ 2024-08-12 05:24:49,189 INFO SenderThread:10279 [job_builder.py:_get_source_type():426] is repo sourced job
+ 2024-08-12 05:24:49,203 INFO SenderThread:10279 [job_builder.py:build():402] adding wandb-job metadata file
+ 2024-08-12 05:24:49,211 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 9
+ 2024-08-12 05:24:49,212 DEBUG SenderThread:10279 [sender.py:send():382] send: artifact
+ 2024-08-12 05:24:49,212 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:49,213 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 9
+ 2024-08-12 05:24:49,806 INFO Thread-12 :10279 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052446-qrv0d6sp/files/output.log
+ 2024-08-12 05:24:49,996 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:50,071 INFO SenderThread:10279 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'versionIndex': 6}}}
+ 2024-08-12 05:24:50,071 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:50,072 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 9
+ 2024-08-12 05:24:50,072 INFO SenderThread:10279 [dir_watcher.py:finish():358] shutting down directory watcher
+ 2024-08-12 05:24:50,807 INFO SenderThread:10279 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_052446-qrv0d6sp/files
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052446-qrv0d6sp/files/requirements.txt requirements.txt
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052446-qrv0d6sp/files/config.yaml config.yaml
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052446-qrv0d6sp/files/wandb-metadata.json wandb-metadata.json
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052446-qrv0d6sp/files/wandb-summary.json wandb-summary.json
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052446-qrv0d6sp/files/output.log output.log
+ 2024-08-12 05:24:50,808 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 10
+ 2024-08-12 05:24:50,809 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:50,809 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:50,812 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 10
+ 2024-08-12 05:24:50,814 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:50,815 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 10
+ 2024-08-12 05:24:50,815 INFO SenderThread:10279 [file_pusher.py:finish():172] shutting down file pusher
+ 2024-08-12 05:24:50,997 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:50,997 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:51,206 INFO wandb-upload_1:10279 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052446-qrv0d6sp/files/config.yaml
+ 2024-08-12 05:24:51,307 INFO wandb-upload_0:10279 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052446-qrv0d6sp/files/requirements.txt
+ 2024-08-12 05:24:51,390 INFO wandb-upload_3:10279 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052446-qrv0d6sp/files/output.log
+ 2024-08-12 05:24:51,401 INFO wandb-upload_2:10279 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052446-qrv0d6sp/files/wandb-summary.json
+ 2024-08-12 05:24:51,602 INFO Thread-11 (_thread_body):10279 [sender.py:transition_state():617] send defer: 11
+ 2024-08-12 05:24:51,602 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:51,602 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 11
+ 2024-08-12 05:24:51,603 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:51,603 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 11
+ 2024-08-12 05:24:51,603 INFO SenderThread:10279 [file_pusher.py:join():178] waiting for file pusher
+ 2024-08-12 05:24:51,603 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 12
+ 2024-08-12 05:24:51,603 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:51,603 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 12
+ 2024-08-12 05:24:51,603 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:51,603 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 12
+ 2024-08-12 05:24:51,603 INFO SenderThread:10279 [file_stream.py:finish():595] file stream finish called
+ 2024-08-12 05:24:51,998 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:52,287 INFO SenderThread:10279 [file_stream.py:finish():599] file stream finish is done
+ 2024-08-12 05:24:52,287 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 13
+ 2024-08-12 05:24:52,287 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:52,287 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:52,288 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 13
+ 2024-08-12 05:24:52,288 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:52,288 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 13
+ 2024-08-12 05:24:52,288 INFO SenderThread:10279 [sender.py:transition_state():617] send defer: 14
+ 2024-08-12 05:24:52,288 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 05:24:52,288 DEBUG SenderThread:10279 [sender.py:send():382] send: final
+ 2024-08-12 05:24:52,288 INFO HandlerThread:10279 [handler.py:handle_request_defer():172] handle defer: 14
+ 2024-08-12 05:24:52,289 DEBUG SenderThread:10279 [sender.py:send():382] send: footer
+ 2024-08-12 05:24:52,289 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 05:24:52,289 INFO SenderThread:10279 [sender.py:send_request_defer():613] handle sender defer: 14
+ 2024-08-12 05:24:52,289 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:52,289 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 05:24:52,290 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:52,290 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 05:24:52,290 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: server_info
+ 2024-08-12 05:24:52,290 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: server_info
+ 2024-08-12 05:24:52,292 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: get_summary
+ 2024-08-12 05:24:52,292 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: sampled_history
+ 2024-08-12 05:24:52,292 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 05:24:52,293 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: job_info
+ 2024-08-12 05:24:52,456 DEBUG SenderThread:10279 [sender.py:send_request():409] send_request: job_info
+ 2024-08-12 05:24:52,457 INFO MainThread:10279 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+ 2024-08-12 05:24:52,457 INFO MainThread:10279 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+ 2024-08-12 05:24:52,457 INFO MainThread:10279 [wandb_run.py:_footer_sync_info():3825] logging synced files
+ 2024-08-12 05:24:52,457 DEBUG HandlerThread:10279 [handler.py:handle_request():146] handle_request: shutdown
+ 2024-08-12 05:24:52,457 INFO HandlerThread:10279 [handler.py:finish():869] shutting down handler
+ 2024-08-12 05:24:53,293 INFO WriterThread:10279 [datastore.py:close():296] close: /project/wandb/run-20240812_052446-qrv0d6sp/run-qrv0d6sp.wandb
+ 2024-08-12 05:24:53,457 INFO SenderThread:10279 [sender.py:finish():1572] shutting down sender
184
+ 2024-08-12 05:24:53,457 INFO SenderThread:10279 [file_pusher.py:finish():172] shutting down file pusher
185
+ 2024-08-12 05:24:53,457 INFO SenderThread:10279 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240812_052446-qrv0d6sp/logs/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Configure stats pid to 10208
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-12 05:24:46,287 INFO MainThread:10208 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_052446-qrv0d6sp/logs/debug.log
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_052446-qrv0d6sp/logs/debug-internal.log
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:init():566] calling init triggers
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-05:24:35', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12//yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:init():616] starting backend
+ 2024-08-12 05:24:46,288 INFO MainThread:10208 [wandb_init.py:init():620] setting up manager
+ 2024-08-12 05:24:46,293 INFO MainThread:10208 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-12 05:24:46,293 INFO MainThread:10208 [wandb_init.py:init():628] backend started and connected
+ 2024-08-12 05:24:46,298 INFO MainThread:10208 [wandb_init.py:init():720] updated telemetry
+ 2024-08-12 05:24:46,310 INFO MainThread:10208 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-12 05:24:46,808 INFO MainThread:10208 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-12 05:24:46,889 INFO MainThread:10208 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-12 05:24:46,889 INFO MainThread:10208 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-12 05:24:46,948 INFO MainThread:10208 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-12 05:24:46,949 INFO MainThread:10208 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-12 05:24:46,949 INFO MainThread:10208 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-12 05:24:46,949 INFO MainThread:10208 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-12 05:24:46,950 INFO MainThread:10208 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-12 05:24:53,458 WARNING MsgRouterThr:10208 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_052446-qrv0d6sp/run-qrv0d6sp.wandb ADDED
Binary file (7.11 kB).
wandb/run-20240812_072401-esew3nhv/files/config.yaml ADDED
@@ -0,0 +1,335 @@
+ wandb_version: 1
+
+ sharding_strategy:
+ desc: null
+ value: FULL_SHARD
+ checkpoint_type:
+ desc: null
+ value: LOCAL_STATE_DICT
+ fsdp_activation_checkpointing:
+ desc: null
+ value: true
+ fsdp_cpu_offload:
+ desc: null
+ value: false
+ low_cpu_fsdp:
+ desc: null
+ value: false
+ no_meta_device:
+ desc: null
+ value: false
+ data_path:
+ desc: null
+ value: null
+ split:
+ desc: null
+ value: 969, 30, 1
+ train_data_path:
+ desc: null
+ value:
+ - '304771887'
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+ valid_data_path:
+ desc: null
+ value:
+ - '304771887'
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+ test_data_path:
+ desc: null
+ value:
+ - '304771887'
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+ data_cache_path:
+ desc: null
+ value: null
+ vocab_size:
+ desc: null
+ value: null
+ vocab_file:
+ desc: null
+ value: null
+ merge_file:
+ desc: null
+ value: null
+ seq_length:
+ desc: null
+ value: 4096
+ num_workers:
+ desc: null
+ value: 2
+ tokenizer_type:
+ desc: null
+ value: HFPreTrainedTokenizer
+ tokenizer_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ reset_position_ids:
+ desc: null
+ value: false
+ reset_attention_mask:
+ desc: null
+ value: false
+ eod_mask_loss:
+ desc: null
+ value: false
+ retro_return_doc_ids:
+ desc: null
+ value: false
+ short_seq_prob:
+ desc: null
+ value: 0.1
+ vocab_extra_ids:
+ desc: null
+ value: 0
+ seed:
+ desc: null
+ value: 1234
+ use_mpi:
+ desc: null
+ value: false
+ wandb_entity:
+ desc: null
+ value: iwakawa-koichi-q5-tohoku-nlp6723
+ wandb_name:
+ desc: null
+ value: yans-qwen2-0.5B_train_2024-08-12-07:23:49
+ wandb_project:
+ desc: null
+ value: llm_tutorial
+ quantization:
+ desc: null
+ value: false
+ use_freeze_layers:
+ desc: null
+ value: false
+ freeze_layers:
+ desc: null
+ value: null
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ mixed_precision:
+ desc: null
+ value: true
+ param_dtype:
+ desc: null
+ value: null
+ load:
+ desc: null
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
+ save:
+ desc: null
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
+ base_model:
+ desc: null
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+ use_better_transformer:
+ desc: null
+ value: false
+ grad_clip_norm:
+ desc: null
+ value: 1.0
+ eval_interval:
+ desc: null
+ value: 5
+ save_interval:
+ desc: null
+ value: 5
+ eval_iters:
+ desc: null
+ value: 10
+ optimizer:
+ desc: null
+ value: adam
+ lr:
+ desc: null
+ value: 2.0e-05
+ lr_decay_style:
+ desc: null
+ value: cosine
+ lr_decay_iters:
+ desc: null
+ value: 20000
+ lr_warmup_iters:
+ desc: null
+ value: 500
+ min_lr:
+ desc: null
+ value: 1.0e-06
+ train_iters:
+ desc: null
+ value: 20000
+ train_samples:
+ desc: null
+ value: null
+ global_batch_size:
+ desc: null
+ value: 320
+ micro_batch_size:
+ desc: null
+ value: 1
+ make_vocab_size_divisible_by:
+ desc: null
+ value: 128
+ sliding_window_size:
+ desc: null
+ value: 4096
+ skip_batch:
+ desc: null
+ value: null
+ no_save_optimizer_state:
+ desc: null
+ value: false
+ continual_pretraining:
+ desc: null
+ value: false
+ instruction_tuning:
+ desc: null
+ value: false
+ direct_preference_optimization:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.1
+ hidden_dropout:
+ desc: null
+ value: 0.1
+ weight_decay:
+ desc: null
+ value: 0.1
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.95
+ adam_eps:
+ desc: null
+ value: 1.0e-06
+ hf_transformer_model_dir:
+ desc: null
+ value: null
+ instruction_train_data_path:
+ desc: null
+ value: null
+ instruction_valid_data_path:
+ desc: null
+ value: null
+ epoch:
+ desc: null
+ value: null
+ instruction_dataset_size:
+ desc: null
+ value: null
+ save_sampler_state:
+ desc: null
+ value: false
+ label_smoothing:
+ desc: null
+ value: 0.0
+ save_n_checkpoints:
+ desc: null
+ value: 10
+ hf_repo_id:
+ desc: null
+ value: koichi12/yans-qwen2-0.5B
+ create_public_hf_repo:
+ desc: null
+ value: false
+ upload_all_checkpoints_to_hf:
+ desc: null
+ value: false
+ hf_upload_retry_limit:
+ desc: null
+ value: 2
+ exit_duration_in_mins:
+ desc: null
+ value: null
+ source_key:
+ desc: null
+ value: null
+ target_key:
+ desc: null
+ value: null
+ attn_implementation:
+ desc: null
+ value: flash_attention_2
+ efficient_instruction_tuning:
+ desc: null
+ value: false
+ remove_padding_masking:
+ desc: null
+ value: false
+ save_start_iter:
+ desc: null
+ value: null
+ rank:
+ desc: null
+ value: 0
+ world_size:
+ desc: null
+ value: 1
+ padded_vocab_size:
+ desc: null
+ value: 151680
+ gradient_accumulation_steps:
+ desc: null
+ value: 320
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.43.3
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1723415041.503914
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 2:
+ - 1
+ - 11
+ - 49
+ - 55
+ - 71
+ 3:
+ - 13
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 6: 4.43.3
+ 8:
+ - 5
+ 13: linux-x86_64
+ model_architecture:
+ desc: null
+ value: Qwen2ForCausalLM
+ activation_function:
+ desc: null
+ value: silu
+ hidden_size:
+ desc: null
+ value: 896
+ model_type:
+ desc: null
+ value: qwen2
+ max_position_embeddings:
+ desc: null
+ value: 4096
+ num_attention_heads:
+ desc: null
+ value: 14
+ num_hidden_layers:
+ desc: null
+ value: 24
wandb/run-20240812_072401-esew3nhv/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohttp==3.9.1
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ apex==0.1
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ cloudpickle==3.0.0
+ cmake==3.28.1
+ colorama==0.4.6
+ comm==0.2.1
+ confection==0.1.4
+ contourpy==1.2.0
+ cubinlinker==0.3.0+2.g405ac64
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+ cudf==23.12.0
+ cugraph-dgl==23.12.0
+ cugraph-service-client==23.12.0
+ cugraph-service-server==23.12.0
+ cugraph==23.12.0
+ cuml==23.12.0
+ cupy-cuda12x==12.3.0
+ cycler==0.12.1
+ cymem==2.0.8
+ cython==3.0.8
+ dask-cuda==23.12.0
+ dask-cudf==23.12.0
+ dask==2023.11.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distributed==2023.11.0
+ dm-tree==0.1.8
+ docker-pycreds==0.4.0
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ execnet==2.0.2
+ executing==2.0.1
+ expecttest==0.1.3
+ fastjsonschema==2.19.1
+ fastrlock==0.8.2
+ filelock==3.13.1
+ flash-attn==2.4.2
+ fonttools==4.48.1
+ frozenlist==1.4.1
+ fsspec==2023.12.2
+ gast==0.5.4
+ gitdb==4.0.11
+ gitpython==3.1.43
+ google-auth-oauthlib==0.4.6
+ google-auth==2.27.0
+ graphsurgeon==0.4.6
+ grpcio==1.60.1
+ huggingface-hub==0.24.5
+ hypothesis==5.35.1
+ idna==3.6
+ importlib-metadata==7.0.1
+ iniconfig==2.0.0
+ intel-openmp==2021.4.0
+ ipadic==1.0.0
+ ipykernel==6.29.2
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ jedi==0.19.1
+ jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonnet==0.19.1
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-client==8.6.0
+ jupyter-core==5.7.1
+ jupyter-tensorboard==0.2.0
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==1.2.0
+ jupyterlab==2.3.2
+ jupytext==1.16.1
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy-loader==0.3
+ librosa==0.10.1
+ llvmlite==0.40.1
+ locket==1.0.0
+ logzero==1.7.0
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ markdown==3.5.2
+ markupsafe==2.1.4
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ mecab-python3==1.0.6
+ mistune==3.0.2
+ mkl-devel==2021.1.1
+ mkl-include==2021.1.1
+ mkl==2021.1.1
+ mock==5.1.0
+ more-itertools==9.1.0
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ nbclient==0.9.0
+ nbconvert==7.16.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==2.6.3
+ ninja==1.11.1.1
+ nltk==3.8.1
+ notebook==6.4.10
+ numba==0.57.1+1.g1ff679645
+ numpy==1.24.4
+ nvfuser==0.1.4a0+d0bb811
+ nvidia-dali-cuda120==1.34.0
+ nvidia-pyindex==1.0.9
+ nvtx==0.2.5
+ oauthlib==3.2.2
+ onnx==1.15.0rc2
+ opencv==4.7.0
+ optree==0.10.0
+ packaging==23.2
+ pandas==1.5.3
+ pandocfilters==1.5.1
+ parso==0.8.3
+ partd==1.4.1
+ peft==0.11.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ pluggy==1.4.0
+ ply==3.11
+ polygraphy==0.49.4
+ pooch==1.8.0
+ portalocker==2.10.1
+ preshed==3.0.9
+ prettytable==3.9.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.24.4
+ psutil==5.9.4
+ ptxcompiler==0.8.1+2.g0d406d6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
+ pyasn1-modules==0.3.0
+ pyasn1==0.5.1
+ pybind11-global==2.11.1
+ pybind11==2.11.1
+ pycocotools==2.0+nv0.8.0
+ pycparser==2.21
+ pydantic-core==2.16.2
+ pydantic==2.6.1
+ pygments==2.17.2
+ pylibcugraph==23.12.0
+ pylibcugraphops==23.12.0
+ pylibraft==23.12.0
+ pynvml==11.4.1
+ pyparsing==3.1.1
+ pytest-flakefinder==1.1.0
+ pytest-rerunfailures==13.0
+ pytest-shard==0.1.2
+ pytest-xdist==3.5.0
+ pytest==8.0.0
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-hostlist==1.23.0
+ pytorch-quantization==2.1.2
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.2
+ raft-dask==23.12.0
+ rapids-dask-dependency==23.12.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests-oauthlib==1.3.1
+ requests==2.31.0
+ rich==13.7.0
+ rmm==23.12.0
+ rpds-py==0.17.1
+ rsa==4.9
+ sacrebleu==2.4.0
+ safetensors==0.4.3
+ scikit-learn==1.2.0
+ scipy==1.12.0
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==2.12.0
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spacy==3.7.2
+ sphinx-glpi-theme==0.6
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ tbb==2021.11.0
+ tblib==3.0.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.1
+ tensorboard==2.9.0
+ tensorrt==8.6.3
+ terminado==0.18.0
+ termplotlib==0.3.9
+ thinc==8.2.3
+ threadpoolctl==3.2.0
+ thriftpy2==0.4.17
+ tinycss2==1.2.1
+ tokenizers==0.19.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ torch-tensorrt==2.3.0a0
+ torch==2.3.0a0+ebedce2
+ torchdata==0.7.1a0
+ torchtext==0.17.0a0
+ torchvision==0.18.0a0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.9.0
+ transformer-engine==1.3.0+5b90b7f
+ transformers==4.43.3
+ treelite-runtime==3.9.1
+ treelite==3.9.1
+ triton==2.2.0+e28a256
+ typer==0.9.0
+ types-dataclasses==0.6.6
+ typing-extensions==4.9.0
+ ucx-py==0.35.0
+ uff==0.6.9
+ ujson==5.8.0
+ urllib3==1.26.18
+ wandb==0.16.3
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ werkzeug==3.0.1
+ wheel==0.42.0
+ xdoctest==1.0.2
+ xgboost==1.7.6
+ yarl==1.9.4
+ zict==3.0.0
+ zipp==3.17.0
wandb/run-20240812_072401-esew3nhv/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-08-11T22:24:02.142128",
+ "startedAt": "2024-08-11T22:24:01.491031",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--seq-length",
+ "4096",
+ "--sliding-window-size",
+ "4096",
+ "--micro-batch-size",
+ "1",
+ "--global-batch-size",
+ "320",
+ "--train-iters",
+ "20000",
+ "--tokenizer-type",
+ "HFPreTrainedTokenizer",
+ "--tokenizer-model",
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+ "--train-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--valid-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--test-data-path",
+ "304771887",
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+ "--lr",
+ "2e-5",
+ "--min-lr",
+ "1e-6",
+ "--lr-decay-style",
+ "cosine",
+ "--lr-warmup-iters",
+ "500",
+ "--lr-decay-iters",
+ "20000",
+ "--weight-decay",
+ "0.1",
+ "--grad-clip-norm",
+ "1.0",
+ "--optimizer",
+ "adam",
+ "--adam-beta1",
+ "0.9",
+ "--adam-beta2",
+ "0.95",
+ "--adam-eps",
+ "1e-6",
+ "--save-interval",
+ "5",
+ "--eval-interval",
+ "5",
+ "--eval-iters",
+ "10",
+ "--bf16",
+ "--mixed-precision",
+ "--base-model",
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+ "--save",
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
+ "--load",
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
+ "--fsdp-activation-checkpointing",
+ "--sharding-strategy",
+ "FULL_SHARD",
+ "--checkpoint-type",
+ "LOCAL_STATE_DICT",
+ "--save-n-checkpoints",
+ "10",
+ "--hf-upload-retry-limit",
+ "2",
+ "--hf-repo-id",
+ "koichi12/yans-qwen2-0.5B",
+ "--wandb-entity",
+ "iwakawa-koichi-q5-tohoku-nlp6723",
+ "--wandb-project",
+ "llm_tutorial",
+ "--wandb-name",
+ "yans-qwen2-0.5B_train_2024-08-12-07:23:49"
+ ],
+ "state": "running",
+ "program": "/project/examples/finetuning.py",
+ "codePathLocal": "examples/finetuning.py",
+ "codePath": "examples/finetuning.py",
+ "git": {
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
+ },
+ "email": null,
+ "root": "/project",
+ "host": "gpu-koiwa-00",
+ "username": "koiwa",
+ "executable": "/usr/bin/python",
+ "cpu_count": 18,
+ "cpu_count_logical": 18,
+ "cpu_freq": {
+ "current": 2400.0429999999997,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.043,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 0.0625,
+ "used": 1.1444091796875e-05
+ }
+ },
+ "gpu": "NVIDIA A100-SXM4-40GB",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100-SXM4-40GB",
+ "memory_total": 42949672960
+ }
+ ],
+ "memory": {
+ "total": 56.487823486328125
+ }
+ }
wandb/run-20240812_072401-esew3nhv/logs/debug-internal.log ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 07:24:01,505 INFO StreamThr :14117 [internal.py:wandb_internal():86] W&B internal server running at pid: 14117, started at: 2024-08-12 07:24:01.504656
2
+ 2024-08-12 07:24:01,507 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 07:24:01,508 INFO WriterThread:14117 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_072401-esew3nhv/run-esew3nhv.wandb
4
+ 2024-08-12 07:24:01,509 DEBUG SenderThread:14117 [sender.py:send():382] send: header
5
+ 2024-08-12 07:24:01,545 DEBUG SenderThread:14117 [sender.py:send():382] send: run
6
+ 2024-08-12 07:24:02,027 INFO SenderThread:14117 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_072401-esew3nhv/files
7
+ 2024-08-12 07:24:02,028 INFO SenderThread:14117 [sender.py:_start_run_threads():1136] run started: esew3nhv with start time 1723415041.503914
8
+ 2024-08-12 07:24:02,033 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 07:24:02,033 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 07:24:02,121 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 07:24:02,127 DEBUG HandlerThread:14117 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 07:24:02,128 DEBUG HandlerThread:14117 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 07:24:02,128 INFO HandlerThread:14117 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 07:24:02,128 INFO SystemMonitor:14117 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 07:24:02,128 INFO HandlerThread:14117 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 07:24:02,129 INFO SystemMonitor:14117 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 07:24:02,129 INFO SystemMonitor:14117 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 07:24:02,130 INFO SystemMonitor:14117 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 07:24:02,131 INFO SystemMonitor:14117 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 07:24:02,131 INFO SystemMonitor:14117 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 07:24:02,142 DEBUG HandlerThread:14117 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 07:24:02,144 DEBUG HandlerThread:14117 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 07:24:02,156 DEBUG HandlerThread:14117 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 07:24:02,156 DEBUG HandlerThread:14117 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 07:24:02,156 DEBUG HandlerThread:14117 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T22:24:02.142128', 'startedAt': '2024-08-11T22:24:01.491031', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '5', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-07:23:49'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 
'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
+ 2024-08-12 07:24:02,156 INFO HandlerThread:14117 [system_monitor.py:probe():224] Finished collecting system info
+ 2024-08-12 07:24:02,156 INFO HandlerThread:14117 [system_monitor.py:probe():227] Publishing system info
+ 2024-08-12 07:24:02,158 INFO HandlerThread:14117 [system_monitor.py:probe():229] Finished publishing system info
+ 2024-08-12 07:24:02,164 DEBUG SenderThread:14117 [sender.py:send():382] send: files
+ 2024-08-12 07:24:02,164 INFO SenderThread:14117 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+ 2024-08-12 07:24:02,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: python_packages
+ 2024-08-12 07:24:02,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: python_packages
+ 2024-08-12 07:24:02,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:24:02,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:24:02,217 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:24:02,505 DEBUG SenderThread:14117 [sender.py:send():382] send: telemetry
+ 2024-08-12 07:24:02,825 INFO wandb-upload_0:14117 [upload_job.py:push():131] Uploaded file /tmp/tmpynfca8juwandb/hnmvl8ac-wandb-metadata.json
+ 2024-08-12 07:24:03,029 INFO Thread-12 :14117 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_072401-esew3nhv/files/wandb-metadata.json
+ 2024-08-12 07:24:03,030 INFO Thread-12 :14117 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:24:03,030 INFO Thread-12 :14117 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_072401-esew3nhv/files/requirements.txt
+ 2024-08-12 07:24:05,030 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:24:05,380 DEBUG SenderThread:14117 [sender.py:send():382] send: config
+ 2024-08-12 07:24:05,381 DEBUG SenderThread:14117 [sender.py:send():382] send: config
+ 2024-08-12 07:24:06,031 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:24:07,031 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:24:07,381 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:08,032 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:24:12,382 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:17,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:24:17,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:24:17,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:24:17,384 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:22,385 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:27,385 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:32,173 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:24:32,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:24:32,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:24:33,387 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:34,049 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/config.yaml
+ 2024-08-12 07:24:38,589 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:43,590 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:47,173 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:24:47,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:24:47,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:24:49,433 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:54,434 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:24:59,434 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:02,132 DEBUG SystemMonitor:14117 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+ 2024-08-12 07:25:02,133 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:25:02,173 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:25:02,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:25:02,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:25:05,393 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:10,394 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:15,394 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:17,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:25:17,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:25:17,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:25:21,046 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: partial_history
+ 2024-08-12 07:25:21,089 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:23,081 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:25:26,090 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:31,091 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:32,134 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:25:32,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:25:32,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:25:32,175 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:25:36,423 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:41,424 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:46,425 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:47,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:25:47,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:25:47,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:25:52,370 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:25:57,371 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:02,135 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:26:02,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:26:02,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:26:02,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:26:02,441 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:07,441 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:12,442 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:17,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:26:17,174 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:26:17,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:26:18,440 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:23,440 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:28,441 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:32,136 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:26:32,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:26:32,175 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:26:32,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:26:34,377 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:36,068 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: partial_history
+ 2024-08-12 07:26:36,070 DEBUG SenderThread:14117 [sender.py:send():382] send: history
+ 2024-08-12 07:26:36,071 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: summary_record
+ 2024-08-12 07:26:36,072 INFO SenderThread:14117 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 07:26:36,128 INFO Thread-12 :14117 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_072401-esew3nhv/files/wandb-summary.json
+ 2024-08-12 07:26:37,129 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:26:40,110 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:45,111 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:47,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:26:47,175 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:26:47,176 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:26:50,379 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:26:55,380 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:00,381 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:02,137 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:27:02,174 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: stop_status
+ 2024-08-12 07:27:02,175 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: stop_status
+ 2024-08-12 07:27:02,216 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-12 07:27:06,378 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:11,379 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:13,948 DEBUG SenderThread:14117 [sender.py:send():382] send: exit
+ 2024-08-12 07:27:13,948 INFO SenderThread:14117 [sender.py:send_exit():589] handling exit code: 255
+ 2024-08-12 07:27:13,948 INFO SenderThread:14117 [sender.py:send_exit():591] handling runtime: 191
+ 2024-08-12 07:27:13,950 INFO SenderThread:14117 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 07:27:13,950 INFO SenderThread:14117 [sender.py:send_exit():597] send defer
+ 2024-08-12 07:27:13,950 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,950 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 0
+ 2024-08-12 07:27:13,950 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,950 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 0
+ 2024-08-12 07:27:13,950 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 1
+ 2024-08-12 07:27:13,951 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,951 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 1
+ 2024-08-12 07:27:13,951 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,951 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 1
+ 2024-08-12 07:27:13,951 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 2
+ 2024-08-12 07:27:13,951 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,951 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 2
+ 2024-08-12 07:27:13,951 INFO HandlerThread:14117 [system_monitor.py:finish():203] Stopping system monitor
+ 2024-08-12 07:27:13,951 DEBUG SystemMonitor:14117 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+ 2024-08-12 07:27:13,951 INFO HandlerThread:14117 [interfaces.py:finish():202] Joined cpu monitor
+ 2024-08-12 07:27:13,951 DEBUG SystemMonitor:14117 [system_monitor.py:_start():183] Publishing last batch of metrics
+ 2024-08-12 07:27:13,952 INFO HandlerThread:14117 [interfaces.py:finish():202] Joined disk monitor
+ 2024-08-12 07:27:13,986 INFO HandlerThread:14117 [interfaces.py:finish():202] Joined gpu monitor
+ 2024-08-12 07:27:13,986 INFO HandlerThread:14117 [interfaces.py:finish():202] Joined memory monitor
+ 2024-08-12 07:27:13,986 INFO HandlerThread:14117 [interfaces.py:finish():202] Joined network monitor
+ 2024-08-12 07:27:13,987 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,987 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 2
+ 2024-08-12 07:27:13,987 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 3
+ 2024-08-12 07:27:13,987 DEBUG SenderThread:14117 [sender.py:send():382] send: stats
+ 2024-08-12 07:27:13,987 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,987 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 3
+ 2024-08-12 07:27:13,989 DEBUG SenderThread:14117 [sender.py:send():382] send: history
+ 2024-08-12 07:27:13,989 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: summary_record
+ 2024-08-12 07:27:13,990 INFO SenderThread:14117 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 07:27:13,990 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,990 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 3
+ 2024-08-12 07:27:13,990 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 4
+ 2024-08-12 07:27:13,990 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,990 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 4
+ 2024-08-12 07:27:13,990 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,990 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 4
+ 2024-08-12 07:27:13,991 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 5
+ 2024-08-12 07:27:13,991 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,991 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 5
+ 2024-08-12 07:27:13,991 DEBUG SenderThread:14117 [sender.py:send():382] send: summary
+ 2024-08-12 07:27:13,992 INFO SenderThread:14117 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+ 2024-08-12 07:27:13,992 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,992 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 5
+ 2024-08-12 07:27:13,993 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 6
+ 2024-08-12 07:27:13,993 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,993 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 6
+ 2024-08-12 07:27:13,993 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,993 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 6
+ 2024-08-12 07:27:13,993 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 7
+ 2024-08-12 07:27:13,993 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:13,993 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:13,993 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 7
+ 2024-08-12 07:27:13,993 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:13,993 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 7
+ 2024-08-12 07:27:14,154 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/wandb-summary.json
+ 2024-08-12 07:27:14,948 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 07:27:15,265 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 8
+ 2024-08-12 07:27:15,265 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 07:27:15,265 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:15,265 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 8
+ 2024-08-12 07:27:15,265 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:15,265 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 8
+ 2024-08-12 07:27:15,266 INFO SenderThread:14117 [job_builder.py:build():296] Attempting to build job artifact
+ 2024-08-12 07:27:15,266 INFO SenderThread:14117 [job_builder.py:_get_source_type():426] is repo sourced job
+ 2024-08-12 07:27:15,281 INFO SenderThread:14117 [job_builder.py:build():402] adding wandb-job metadata file
+ 2024-08-12 07:27:15,289 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 9
+ 2024-08-12 07:27:15,290 DEBUG SenderThread:14117 [sender.py:send():382] send: artifact
+ 2024-08-12 07:27:15,290 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:15,291 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 9
+ 2024-08-12 07:27:15,948 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-12 07:27:16,156 INFO Thread-12 :14117 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_072401-esew3nhv/files/output.log
+ 2024-08-12 07:27:16,288 INFO SenderThread:14117 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
+ 2024-08-12 07:27:16,288 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:16,288 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 9
+ 2024-08-12 07:27:16,288 INFO SenderThread:14117 [dir_watcher.py:finish():358] shutting down directory watcher
+ 2024-08-12 07:27:17,157 INFO SenderThread:14117 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_072401-esew3nhv/files
+ 2024-08-12 07:27:17,157 INFO SenderThread:14117 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_072401-esew3nhv/files/requirements.txt requirements.txt
+ 2024-08-12 07:27:17,157 INFO SenderThread:14117 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_072401-esew3nhv/files/config.yaml config.yaml
+ 2024-08-12 07:27:17,158 INFO SenderThread:14117 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_072401-esew3nhv/files/wandb-metadata.json wandb-metadata.json
+ 2024-08-12 07:27:17,158 INFO SenderThread:14117 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_072401-esew3nhv/files/wandb-summary.json wandb-summary.json
+ 2024-08-12 07:27:17,158 INFO SenderThread:14117 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_072401-esew3nhv/files/output.log output.log
+ 2024-08-12 07:27:17,158 INFO SenderThread:14117 [sender.py:transition_state():617] send defer: 10
+ 2024-08-12 07:27:17,158 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-12 07:27:17,158 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-12 07:27:17,159 INFO HandlerThread:14117 [handler.py:handle_request_defer():172] handle defer: 10
+ 2024-08-12 07:27:17,159 DEBUG SenderThread:14117 [sender.py:send_request():409] send_request: defer
+ 2024-08-12 07:27:17,159 INFO SenderThread:14117 [sender.py:send_request_defer():613] handle sender defer: 10
+ 2024-08-12 07:27:17,159 INFO SenderThread:14117 [file_pusher.py:finish():172] shutting down file pusher
+ 2024-08-12 07:27:22,160 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:27,160 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:32,161 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:37,162 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:42,162 DEBUG HandlerThread:14117 [handler.py:handle_request():146] handle_request: status_report
+ 2024-08-12 07:27:46,742 WARNING StreamThr :14117 [internal.py:is_dead():414] Internal process exiting, parent pid 14046 disappeared
+ 2024-08-12 07:27:46,742 ERROR StreamThr :14117 [internal.py:wandb_internal():152] Internal process shutdown.
+ 2024-08-12 07:27:47,163 INFO SenderThread:14117 [sender.py:finish():1572] shutting down sender
+ 2024-08-12 07:27:47,163 INFO HandlerThread:14117 [handler.py:finish():869] shutting down handler
+ 2024-08-12 07:27:47,163 INFO SenderThread:14117 [file_pusher.py:finish():172] shutting down file pusher
+ 2024-08-12 07:27:47,163 INFO SenderThread:14117 [file_pusher.py:join():178] waiting for file pusher
+ 2024-08-12 07:27:47,163 INFO SenderThread:14117 [file_stream.py:finish():595] file stream finish called
+ 2024-08-12 07:27:47,163 INFO WriterThread:14117 [datastore.py:close():296] close: /project/wandb/run-20240812_072401-esew3nhv/run-esew3nhv.wandb
+ 2024-08-12 07:27:47,333 INFO SenderThread:14117 [file_stream.py:finish():599] file stream finish is done
wandb/run-20240812_072401-esew3nhv/logs/debug.log ADDED
@@ -0,0 +1,29 @@
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Configure stats pid to 14046
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_072401-esew3nhv/logs/debug.log
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_072401-esew3nhv/logs/debug-internal.log
+ 2024-08-12 07:24:01,497 INFO MainThread:14046 [wandb_init.py:init():566] calling init triggers
+ 2024-08-12 07:24:01,498 INFO MainThread:14046 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-07:23:49', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 5, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': 
False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
+ 2024-08-12 07:24:01,498 INFO MainThread:14046 [wandb_init.py:init():616] starting backend
+ 2024-08-12 07:24:01,498 INFO MainThread:14046 [wandb_init.py:init():620] setting up manager
+ 2024-08-12 07:24:01,503 INFO MainThread:14046 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-12 07:24:01,503 INFO MainThread:14046 [wandb_init.py:init():628] backend started and connected
+ 2024-08-12 07:24:01,508 INFO MainThread:14046 [wandb_init.py:init():720] updated telemetry
+ 2024-08-12 07:24:01,540 INFO MainThread:14046 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-12 07:24:02,032 INFO MainThread:14046 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-12 07:24:02,113 INFO MainThread:14046 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-12 07:24:02,114 INFO MainThread:14046 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-12 07:24:02,173 INFO MainThread:14046 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-12 07:24:02,173 INFO MainThread:14046 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-12 07:24:02,174 INFO MainThread:14046 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-12 07:24:02,174 INFO MainThread:14046 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-12 07:24:02,174 INFO MainThread:14046 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-12 07:24:05,379 INFO MainThread:14046 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
+ 2024-08-12 07:24:05,380 INFO MainThread:14046 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}