diff --git "a/qwen4B_models/qwen_4B_d1_iter2_model/lora.log" "b/qwen4B_models/qwen_4B_d1_iter2_model/lora.log" new file mode 100644--- /dev/null +++ "b/qwen4B_models/qwen_4B_d1_iter2_model/lora.log" @@ -0,0 +1,1126 @@ +[2024-07-15 10:01:03,518] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:04,910] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2024-07-15 10:01:04,960] [INFO] [runner.py:571:main] cmd = /ML-A100/team/mm/zhangge/anaconda3/envs/improve/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None ../../src/train_bash.py --deepspeed ../deepspeed/ds_z3_config.json --stage sft --do_train --model_name_or_path /ML-A100/team/mm/eamon/self_instruction/models/Qwen1_5_4B --dataset qwen_4B_d1_iter2_model --dataset_dir ../../data --template qwen_like --finetuning_type lora --lora_target all --lora_rank 8 --lora_alpha 16 --lora_dropout 0.05 --output_dir /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model --overwrite_cache --overwrite_output_dir --cutoff_len 1024 --preprocessing_num_workers 8 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 2 --lr_scheduler_type cosine --logging_steps 10 --warmup_steps 20 --save_steps 100 --eval_steps 100 --evaluation_strategy steps --load_best_model_at_end --learning_rate 5e-5 --num_train_epochs 2.0 --max_samples 3000 --val_size 0.1 --ddp_timeout 180000000 --plot_loss --bf16 +[2024-07-15 10:01:06,996] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_PCI_RELAXED_ORDERING=1 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_DEBUG=INFO +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth1 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_GID_INDEX=7 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_RETRY_CNT=7 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_DISABLE=0 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.19.3 +[2024-07-15 10:01:07,909] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=23 +[2024-07-15 10:01:07,909] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2024-07-15 10:01:07,909] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=8, node_rank=0 +[2024-07-15 10:01:07,909] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2024-07-15 10:01:07,909] [INFO] [launch.py:163:main] dist_world_size=8 +[2024-07-15 10:01:07,909] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2024-07-15 10:01:13,877] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:13,877] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:13,879] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:13,881] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:13,889] [INFO] 
[real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:13,905] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:14,342] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:14,342] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 10:01:16,090] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,109] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,109] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,115] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,115] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,116] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,371] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 10:01:16,371] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2024-07-15 10:01:16,379] [INFO] [comm.py:637:init_distributed] cdb=None +07/15/2024 10:01:16 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:16 - INFO - llmtuner.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:16 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 7, device: cuda:7, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 10:01:17 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. 
+07/15/2024 10:01:17 - INFO - llmtuner.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:17 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+07/15/2024 10:01:17 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words.
+t-20240715144706-mwmm8-worker-0:91732:91732 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91732:91732 [0] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91732:91732 [0] NCCL INFO cudaDriverVersion 12030
+NCCL version 2.18.1+cuda12.1
+t-20240715144706-mwmm8-worker-0:91734:91734 [2] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91734:91734 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91734:91734 [2] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91737:91737 [5] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91737:91737 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91738:91738 [6] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91737:91737 [5] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91738:91738 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91739:91739 [7] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91738:91738 [6] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91739:91739 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91735:91735 [3] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91735:91735 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91733:91733 [1] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91733:91733 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91736:91736 [4] NCCL INFO cudaDriverVersion 12030
+t-20240715144706-mwmm8-worker-0:91736:91736 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91739:91739 [7] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91735:91735 [3] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91733:91733 [1] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91736:91736 [4] NCCL INFO Bootstrap : Using eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO P2P plugin IBext
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.40.22<0>
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO NVLS multicast support is not available on dev 1
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO NVLS multicast support is not available on dev 3
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO NVLS multicast support is not available on dev 4
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO NVLS multicast support is not available on dev 5
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO NVLS multicast support is not available on dev 2
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO NVLS multicast support is not available on dev 6
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO NVLS multicast support is not available on dev 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO NVLS multicast support is not available on dev 0
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO P2P Chunksize set to 524288
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 00/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 01/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 02/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 03/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 04/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 05/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 06/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 07/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 08/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 09/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 10/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 11/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 00/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 00/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 01/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 01/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 12/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 13/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 14/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 02/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 02/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 03/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 03/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 15/0 : 1[13000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 04/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 04/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 05/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 05/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 06/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 06/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 07/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 07/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 08/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 08/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 09/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 09/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 10/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 10/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 11/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 11/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 12/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 12/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 13/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 13/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 14/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 14/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Channel 15/0 : 0[d000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 15/0 : 2[29000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Connected all rings
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 00/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 01/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 02/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 00/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 03/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 04/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 05/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 06/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 01/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 02/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 03/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 04/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 05/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 06/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 07/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 07/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 08/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 08/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 09/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 09/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 10/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 11/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 12/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 13/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 14/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Channel 15/0 : 2[29000] -> 1[13000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 10/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 11/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 12/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 13/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 14/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Channel 15/0 : 1[13000] -> 0[d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:92933 [3] NCCL INFO comm 0xd0505a0 rank 3 nranks 8 cudaDev 3 busId 2d000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91733:92932 [1] NCCL INFO comm 0xce60b10 rank 1 nranks 8 cudaDev 1 busId 13000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91737:92931 [5] NCCL INFO comm 0xc7fa7d0 rank 5 nranks 8 cudaDev 5 busId ca000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91738:92934 [6] NCCL INFO comm 0xd3a0fb0 rank 6 nranks 8 cudaDev 6 busId e0000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91739:92936 [7] NCCL INFO comm 0xc8e2c10 rank 7 nranks 8 cudaDev 7 busId e4000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91734:92930 [2] NCCL INFO comm 0xdd9d6d0 rank 2 nranks 8 cudaDev 2 busId 29000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91736:92935 [4] NCCL INFO comm 0xc16ac60 rank 4 nranks 8 cudaDev 4 busId c5000 commId 0xe41909ebc53a198b - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91732:92922 [0] NCCL INFO comm 0xc5634e0 rank 0 nranks 8 cudaDev 0 busId d000 commId 0xe41909ebc53a198b - Init COMPLETE
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_outputs/qwen_4B_iter1_d1_10k_output_filtered_evaluated_filtered.json...
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+07/15/2024 10:01:22 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+input_ids:
+[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 37134, 26887, 1995, 504, 279, 58218, 2150, 911, 894, 2007, 323, 10339, 3170, 432, 374, 1661, 369, 498, 624, 23729, 12879, 151645, 198, 151644, 77091, 198, 23729, 12879, 374, 264, 6233, 3283, 7407, 389, 279, 18172, 13648, 315, 7043, 13, 576, 3283, 374, 3881, 369, 1181, 6233, 26845, 11, 48177, 7674, 11, 323, 17923, 3607, 13, 1084, 374, 264, 2244, 1992, 311, 3947, 369, 2176, 31653, 323, 24914, 11, 438, 1052, 374, 2677, 2494, 311, 653, 323, 1490, 382, 3966, 2874, 3170, 5836, 12879, 374, 773, 2244, 374, 1576, 315, 1181, 6233, 26845, 13, 576, 3283, 702, 3807, 3460, 26845, 11, 2670, 17809, 29243, 5540, 323, 16608, 4753, 594, 1232, 59226, 13, 4220, 26845, 3010, 264, 2244, 1992, 311, 11967, 323, 4669, 6993, 11, 438, 1632, 438, 311, 15759, 304, 5257, 7488, 1741, 438, 98638, 11, 5619, 9833, 11, 476, 1101, 1251, 2630, 31924, 382, 14037, 2874, 3170, 5836, 12879, 374, 2244, 374, 1576, 315, 1181, 48177, 7674, 13, 576, 3283, 374, 2114, 311, 3807, 1947, 42554, 11, 50577, 11, 323, 43766, 13, 2619, 525, 1083, 1657, 4627, 323, 15254, 4357, 6814, 279, 1042, 11, 3259, 432, 264, 2244, 1992, 369, 1846, 879, 2948, 279, 18560, 382, 26746, 374, 2441, 2874, 3170, 5836, 12879, 374, 773, 2244, 13, 576, 3283, 374, 2114, 311, 1657, 15556, 323, 3607, 26763, 429, 8683, 17923, 323, 16807, 35005, 13, 5542, 66222, 311, 84401, 311, 22502, 11, 1052, 374, 2494, 369, 5019, 311, 4669, 382, 27489, 11, 5836, 12879, 374, 264, 2244, 1992, 311, 3947, 369, 5489, 3330, 311, 3139, 264, 48177, 323, 6233, 3283, 448, 10077, 315, 2513, 311, 653, 323, 1490, 13, 151643]
+inputs:
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+Synthesize information from the wikipedia page about anyplace and explain why it is good for you.
+San Francisco<|im_end|>
+<|im_start|>assistant
+San Francisco is a beautiful city located on the northern coast of California. The city is known for its beautiful parks, lively culture, and delicious food. It is a great place to visit for both tourists and locals, as there is always something to do and see.
+
+One reason why San Francisco is so great is because of its beautiful parks. The city has several large parks, including Golden Gate Park and Fisherman's Wharf. These parks offer a great place to relax and enjoy nature, as well as to participate in various activities such as jogging, playing sports, or just people-watching.
+
+Another reason why San Francisco is great is because of its lively culture. The city is home to several art galleries, museums, and theaters. There are also many music and dance events throughout the year, making it a great place for those who love the arts.
+
+Food is another reason why San Francisco is so great. The city is home to many restaurants and food trucks that serve delicious and diverse cuisine. From sushi to tacos to pizza, there is something for everyone to enjoy.
+ +Overall, San Francisco is a great place to visit for anyone looking to experience a lively and beautiful city with lots of things to do and see.<|endoftext|> +label_ids: +[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 23729, 12879, 374, 264, 6233, 3283, 7407, 389, 279, 18172, 13648, 315, 7043, 13, 576, 3283, 374, 3881, 369, 1181, 6233, 26845, 11, 48177, 7674, 11, 323, 17923, 3607, 13, 1084, 374, 264, 2244, 1992, 311, 3947, 369, 2176, 31653, 323, 24914, 11, 438, 1052, 374, 2677, 2494, 311, 653, 323, 1490, 382, 3966, 2874, 3170, 5836, 12879, 374, 773, 2244, 374, 1576, 315, 1181, 6233, 26845, 13, 576, 3283, 702, 3807, 3460, 26845, 11, 2670, 17809, 29243, 5540, 323, 16608, 4753, 594, 1232, 59226, 13, 4220, 26845, 3010, 264, 2244, 1992, 311, 11967, 323, 4669, 6993, 11, 438, 1632, 438, 311, 15759, 304, 5257, 7488, 1741, 438, 98638, 11, 5619, 9833, 11, 476, 1101, 1251, 2630, 31924, 382, 14037, 2874, 3170, 5836, 12879, 374, 2244, 374, 1576, 315, 1181, 48177, 7674, 13, 576, 3283, 374, 2114, 311, 3807, 1947, 42554, 11, 50577, 11, 323, 43766, 13, 2619, 525, 1083, 1657, 4627, 323, 15254, 4357, 6814, 279, 1042, 11, 3259, 432, 264, 2244, 1992, 369, 1846, 879, 2948, 279, 18560, 382, 26746, 374, 2441, 2874, 3170, 5836, 12879, 374, 773, 2244, 13, 576, 3283, 374, 2114, 311, 1657, 15556, 323, 3607, 26763, 429, 8683, 17923, 323, 16807, 35005, 13, 5542, 66222, 311, 84401, 311, 22502, 11, 1052, 374, 2494, 369, 5019, 311, 4669, 382, 27489, 11, 5836, 12879, 374, 264, 2244, 1992, 311, 3947, 369, 5489, 3330, 311, 3139, 264, 48177, 323, 6233, 3283, 448, 10077, 315, 2513, 311, 653, 323, 1490, 13, 151643] +labels: +San Francisco is a beautiful city located on the northern coast of California. The city is known for its beautiful parks, lively culture, and delicious food. It is a great place to visit for both tourists and locals, as there is always something to do and see. + +One reason why San Francisco is so great is because of its beautiful parks. The city has several large parks, including Golden Gate Park and Fisherman's Wharf. These parks offer a great place to relax and enjoy nature, as well as to participate in various activities such as jogging, playing sports, or just people-watching. + +Another reason why San Francisco is great is because of its lively culture. The city is home to several art galleries, museums, and theaters. There are also many music and dance events throughout the year, making it a great place for those who love the arts. + +Food is another reason why San Francisco is so great. The city is home to many restaurants and food trucks that serve delicious and diverse cuisine. From sushi to tacos to pizza, there is something for everyone to enjoy. + +Overall, San Francisco is a great place to visit for anyone looking to experience a lively and beautiful city with lots of things to do and see.<|endoftext|> +[2024-07-15 10:02:05,179] [INFO] [partition_parameters.py:349:__exit__] finished initializing model - num_params = 483, num_elems = 3.95B +07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. 
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: k_proj,gate_proj,up_proj,o_proj,q_proj,down_proj,v_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: q_proj,o_proj,v_proj,gate_proj,up_proj,k_proj,down_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: up_proj,q_proj,gate_proj,down_proj,k_proj,o_proj,v_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: down_proj,v_proj,o_proj,gate_proj,k_proj,up_proj,q_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: gate_proj,up_proj,down_proj,v_proj,o_proj,k_proj,q_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: o_proj,gate_proj,v_proj,q_proj,k_proj,down_proj,up_proj
+07/15/2024 10:02:08 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:08 - INFO - llmtuner.model.utils - Found linear modules: k_proj,up_proj,v_proj,down_proj,gate_proj,q_proj,o_proj
+07/15/2024 10:02:09 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
+07/15/2024 10:02:09 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+07/15/2024 10:02:09 - INFO - llmtuner.model.utils - Found linear modules: k_proj,down_proj,o_proj,up_proj,q_proj,gate_proj,v_proj
+07/15/2024 10:02:17 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:17 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+07/15/2024 10:02:18 - INFO - llmtuner.model.loader - trainable params: 15646720 || all params: 3966016000 || trainable%: 0.3945
+[2024-07-15 10:02:19,173] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.0, git-hash=unknown, git-branch=unknown
+[2024-07-15 10:02:19,219] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2024-07-15 10:02:19,224] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2024-07-15 10:02:19,224] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2024-07-15 10:02:19,279] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2024-07-15 10:02:19,279] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=
+[2024-07-15 10:02:19,279] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
+[2024-07-15 10:02:19,279] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
+[2024-07-15 10:02:19,471] [INFO] [utils.py:791:see_memory_usage] Stage 3 initialize beginning
+[2024-07-15 10:02:19,471] [INFO] [utils.py:792:see_memory_usage] MA 1.61 GB         Max_MA 3.03 GB         CA 1.72 GB         Max_CA 3 GB
+[2024-07-15 10:02:19,472] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.85 GB, percent = 0.4%
+[2024-07-15 10:02:19,481] [INFO] [stage3.py:128:__init__] Reduce bucket size 6553600
+[2024-07-15 10:02:19,481] [INFO] [stage3.py:129:__init__] Prefetch bucket size 5898240
+[2024-07-15 10:02:19,663] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
+[2024-07-15 10:02:19,663] [INFO] [utils.py:792:see_memory_usage] MA 1.61 GB         Max_MA 1.61 GB         CA 1.72 GB         Max_CA 2 GB
+[2024-07-15 10:02:19,664] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.85 GB, percent = 0.4%
+Parameter Offload: Total persistent parameters: 9525760 in 641 params
+[2024-07-15 10:02:20,090] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
+[2024-07-15 10:02:20,091] [INFO] [utils.py:792:see_memory_usage] MA 1.59 GB         Max_MA 1.61 GB         CA 1.72 GB         Max_CA 2 GB
+[2024-07-15 10:02:20,091] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.85 GB, percent = 0.4%
+[2024-07-15 10:02:20,286] [INFO] [utils.py:791:see_memory_usage] Before creating fp16 partitions
+[2024-07-15 10:02:20,286] [INFO] [utils.py:792:see_memory_usage] MA 1.59 GB         Max_MA 1.59 GB         CA 1.72 GB         Max_CA 2 GB
+[2024-07-15 10:02:20,287] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.85 GB, percent = 0.4%
+[2024-07-15 10:02:20,714] [INFO] [utils.py:791:see_memory_usage] After creating fp16 partitions: 1
+[2024-07-15 10:02:20,715] [INFO] [utils.py:792:see_memory_usage] MA 1.59 GB         Max_MA 1.59 GB         CA 1.67 GB         Max_CA 2 GB
+[2024-07-15 10:02:20,716] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.88 GB, percent = 0.4%
+[2024-07-15 10:02:20,896] [INFO] [utils.py:791:see_memory_usage] Before creating fp32 partitions
+[2024-07-15 10:02:20,897] [INFO] [utils.py:792:see_memory_usage] MA 1.59 GB         Max_MA 1.59 GB         CA 1.67 GB         Max_CA 2 GB
+[2024-07-15 10:02:20,897] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.88 GB, percent = 0.4%
+[2024-07-15 10:02:21,078] [INFO] [utils.py:791:see_memory_usage] After creating fp32 partitions
+[2024-07-15 10:02:21,079] [INFO] [utils.py:792:see_memory_usage] MA 1.6 GB         Max_MA 1.6 GB         CA 1.67 GB         Max_CA 2 GB
+[2024-07-15 10:02:21,079] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 7.88 GB, percent = 0.4%
+[2024-07-15 10:02:21,296] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
+[2024-07-15 10:02:21,297] [INFO] [utils.py:792:see_memory_usage] MA 1.6 GB         Max_MA 1.6 GB         CA 1.67 GB         Max_CA 2 GB
+[2024-07-15 10:02:21,297] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 8.1 GB, percent = 0.4%
+[2024-07-15 10:02:21,535] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
+[2024-07-15 10:02:21,536] [INFO] [utils.py:792:see_memory_usage] MA 1.61 GB         Max_MA 1.63 GB         CA 1.71 GB         Max_CA 2 GB
+[2024-07-15 10:02:21,536] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 8.13 GB, percent = 0.4%
+[2024-07-15 10:02:21,537] [INFO] [stage3.py:482:_setup_for_real_optimizer] optimizer state initialized
+[2024-07-15 10:02:21,951] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
+[2024-07-15 10:02:21,952] [INFO] [utils.py:792:see_memory_usage] MA 1.63 GB         Max_MA 1.63 GB         CA 1.71 GB         Max_CA 2 GB
+[2024-07-15 10:02:21,952] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 8.34 GB, percent = 0.4%
+[2024-07-15 10:02:21,952] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
+[2024-07-15 10:02:21,953] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2024-07-15 10:02:21,953] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
+[2024-07-15 10:02:21,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]
+[2024-07-15 10:02:21,957] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   activation_checkpointing_config  {
+    "partition_activations": false,
+    "contiguous_memory_optimization": false,
+    "cpu_checkpointing": false,
+    "number_checkpoints": null,
+    "synchronize_checkpoint_boundary": false,
+    "profile": false
+}
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   amp_enabled .................. False
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   amp_params ................... False
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   autotuning_config ............ {
+    "enabled": false,
+    "start_step": null,
+    "end_step": null,
+    "metric_path": null,
+    "arg_mappings": null,
+    "metric": "throughput",
+    "model_info": null,
+    "results_dir": "autotuning_results",
+    "exps_dir": "autotuning_exps",
+    "overwrite": true,
+    "fast": true,
+    "start_profile_step": 3,
+    "end_profile_step": 5,
+    "tuner_type": "gridsearch",
+    "tuner_early_stopping": 5,
+    "tuner_num_trials": 50,
+    "model_info_path": null,
+    "mp_size": 1,
+    "max_train_batch_size": null,
+    "min_train_batch_size": 1,
+    "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+    "min_train_micro_batch_size_per_gpu": 1,
+    "num_tuning_micro_batch_sizes": 3
+}
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   bfloat16_enabled ............. True
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   checkpoint_parallel_write_pipeline  False
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   checkpoint_tag_validation_enabled  True
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   checkpoint_tag_validation_fail  False
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   comms_config .................
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   communication_data_type ...... None
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-07-15 10:02:21,958] [INFO] [config.py:988:print]   curriculum_enabled_legacy .... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   curriculum_params_legacy ..... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   data_efficiency_enabled ...... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   dataloader_drop_last ......... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   disable_allgather ............ False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   dump_state ................... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   dynamic_loss_scale_args ...... None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_enabled ........... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_gas_boundary_resolution  1
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_layer_name ........ bert.encoder.layer
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_layer_num ......... 0
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_max_iter .......... 100
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_stability ......... 1e-06
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_tol ............... 0.01
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   eigenvalue_verbose ........... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   elasticity_enabled ........... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   flops_profiler_config ........ {
+    "enabled": false,
+    "recompute_fwd_factor": 0.0,
+    "profile_step": 1,
+    "module_depth": -1,
+    "top_modules": 1,
+    "detailed": true,
+    "output_file": null
+}
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   fp16_auto_cast ............... None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   fp16_enabled ................. False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   fp16_master_weights_and_gradients  False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   global_rank .................. 0
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   grad_accum_dtype ............. None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   gradient_accumulation_steps .. 2
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   gradient_clipping ............ 1.0
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   gradient_predivide_factor .... 1.0
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   graph_harvesting ............. False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   initial_dynamic_scale ........ 1
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   load_universal_checkpoint .... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   loss_scale ................... 1.0
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   memory_breakdown ............. False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   mics_hierarchial_params_gather  False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   mics_shard_size .............. -1
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   nebula_config ................ {
+    "enabled": false,
+    "persistent_storage_path": null,
+    "persistent_time_interval": 100,
+    "num_of_version_in_retention": 2,
+    "enable_nebula_load": true,
+    "load_path": null
+}
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   optimizer_legacy_fusion ...... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   optimizer_name ............... None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   optimizer_params ............. None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   pld_enabled .................. False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   pld_params ................... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   prescale_gradients ........... False
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   scheduler_name ............... None
+[2024-07-15 10:02:21,959] [INFO] [config.py:988:print]   scheduler_params ............. None
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   seq_parallel_communication_data_type  torch.float32
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   sparse_attention ............. None
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   sparse_gradients_enabled ..... False
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   steps_per_print .............. inf
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   train_batch_size ............. 16
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   train_micro_batch_size_per_gpu  1
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   use_data_before_expert_parallel_  False
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   use_node_local_storage ....... False
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   wall_clock_breakdown ......... False
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   weight_quantization_config ... None
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   world_size ................... 8
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   zero_allow_untested_optimizer  True
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=6553600 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=5898240 param_persistence_threshold=25600 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   zero_enabled ................. True
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   zero_force_ds_cpu_optimizer .. True
+[2024-07-15 10:02:21,960] [INFO] [config.py:988:print]   zero_optimization_stage ...... 3
+[2024-07-15 10:02:21,960] [INFO] [config.py:974:print_user_config] json = {
+    "train_batch_size": 16,
+    "train_micro_batch_size_per_gpu": 1,
+    "gradient_accumulation_steps": 2,
+    "gradient_clipping": 1.0,
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+        "enabled": false,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1.000000e+09,
+        "reduce_bucket_size": 6.553600e+06,
+        "stage3_prefetch_bucket_size": 5.898240e+06,
+        "stage3_param_persistence_threshold": 2.560000e+04,
+        "stage3_max_live_parameters": 1.000000e+09,
+        "stage3_max_reuse_distance": 1.000000e+09,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "steps_per_print": inf
+}
+t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Using network IBext
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO NVLS multicast support is not available on dev 7
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO NVLS multicast support is not available on dev 4
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO NVLS multicast support is not available on dev 3
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO NVLS multicast support is not available on dev 1
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO NVLS multicast support is not available on dev 6
+t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO NVLS multicast support is not available on dev 0
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,00000000,ffffffff
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,00000000,ffffffff,00000000
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO NVLS multicast support is not available on dev 5
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO NVLS multicast support is not available on dev 2
+t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0
[3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Trees [0] 
5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO P2P Chunksize set to 524288 +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 00/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 01/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 02/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 03/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 04/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 05/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 06/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 07/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 08/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 09/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 10/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 00/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 01/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 11/0 : 2[29000] -> 3[2d000] via P2P/IPC/read 
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 12/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 13/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 14/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 15/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 00/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 02/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 01/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 03/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 04/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 02/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 05/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 03/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 
0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 04/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 06/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 05/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 07/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 08/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 06/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 09/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 07/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 
09/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 08/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 10/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 09/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 11/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 10/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 12/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 11/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 13/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 12/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 
[6] NCCL INFO Channel 13/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 14/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 13/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Channel 15/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 14/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 15/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Connected all rings +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO 
Channel 12/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 00/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 00/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 01/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 02/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 03/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 01/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 04/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 02/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 05/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 03/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 04/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 05/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 06/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 07/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 08/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 06/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 09/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 07/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 10/0 : 1[13000] -> 0[d000] via P2P/IPC/read 
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 08/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 11/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 09/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 12/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 10/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 13/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 11/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 14/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 12/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Channel 15/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 13/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 14/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO Connected all trees +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Channel 15/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO Connected all trees 
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 2[29000] via P2P/IPC/read
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO Connected all trees
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
+t-20240715144706-mwmm8-worker-0:91738:94443 [6] NCCL INFO comm 0x7f69b0d72750 rank 6 nranks 8 cudaDev 6 busId e0000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91739:94440 [7] NCCL INFO comm 0x7ff656a4bc50 rank 7 nranks 8 cudaDev 7 busId e4000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91736:94444 [4] NCCL INFO comm 0x7f5ec8ead5b0 rank 4 nranks 8 cudaDev 4 busId c5000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91735:94442 [3] NCCL INFO comm 0x7f9de4d6cb70 rank 3 nranks 8 cudaDev 3 busId 2d000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91734:94438 [2] NCCL INFO comm 0x7fed7cd720e0 rank 2 nranks 8 cudaDev 2 busId 29000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91733:94441 [1] NCCL INFO comm 0x7faeb0d6df30 rank 1 nranks 8 cudaDev 1 busId 13000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91737:94439 [5] NCCL INFO comm 0x7f5e44d72fd0 rank 5 nranks 8 cudaDev 5 busId ca000 commId 0xb224f0fb05f92136 - Init COMPLETE
+t-20240715144706-mwmm8-worker-0:91732:94437 [0] NCCL INFO comm 0x7f2e729b1830 rank 0 nranks 8 cudaDev 0 busId d000 commId 0xb224f0fb05f92136 - Init COMPLETE
+{'loss': 1.438, 'grad_norm': 1.1161658260127314, 'learning_rate': 2.5e-05, 'epoch': 0.15}
+{'loss': 1.2455, 'grad_norm': 1.141386114916924, 'learning_rate': 5e-05, 'epoch': 0.29}
+{'loss': 0.8748, 'grad_norm': 0.908747056691058, 'learning_rate': 4.908874981298057e-05, 'epoch': 0.44}
+{'loss': 0.8399, 'grad_norm': 1.160724824332908, 'learning_rate': 4.642142940418973e-05, 'epoch': 0.59}
+{'loss': 0.8371, 'grad_norm': 1.7354204311716015, 'learning_rate': 4.2192486471335585e-05, 'epoch': 0.74}
+{'loss': 0.8346, 'grad_norm': 0.989621687985043, 'learning_rate': 3.671021101749476e-05, 'epoch': 0.88}
+{'loss': 0.85, 'grad_norm': 1.4467052607165527, 'learning_rate': 3.0374261005275607e-05, 'epoch': 1.03}
+{'loss': 0.8218, 'grad_norm': 1.022122556125995, 'learning_rate': 2.3646527285364565e-05, 'epoch': 1.18}
+{'loss': 0.8021, 'grad_norm': 1.168833164765407, 'learning_rate': 1.7017461746600506e-05, 'epoch': 1.32}
+{'loss': 0.6978, 'grad_norm': 0.4995411891529773, 'learning_rate': 1.0970323365940444e-05, 'epoch': 1.47}
+{'eval_loss': 0.7856930494308472, 'eval_runtime': 12.019, 'eval_samples_per_second': 10.067, 'eval_steps_per_second': 1.331, 'epoch': 1.47}
+[2024-07-15 10:08:07,799] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step100 is about to be saved!
+[2024-07-15 10:08:07,829] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2024-07-15 10:08:07,829] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2024-07-15 10:08:07,856] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2024-07-15 10:08:07,859] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-07-15 10:08:07,893] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-07-15 10:08:07,894] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-07-15 10:08:07,940] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now!
+{'loss': 0.7205, 'grad_norm': 1.6403596296455754, 'learning_rate': 5.945948621809091e-06, 'epoch': 1.62}
+{'loss': 0.787, 'grad_norm': 0.8006724271557235, 'learning_rate': 2.310614508226078e-06, 'epoch': 1.76}
+{'loss': 0.7046, 'grad_norm': 1.7764723276974967, 'learning_rate': 3.293369364618465e-07, 'epoch': 1.91}
+[2024-07-15 10:10:00,376] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2024-07-15 10:10:00,391] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2024-07-15 10:10:00,391] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2024-07-15 10:10:00,405] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2024-07-15 10:10:00,432] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-07-15 10:10:00,444] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-07-15 10:10:00,444] [INFO] [engine.py:3019:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0
+[2024-07-15 10:10:00,456] [INFO] [engine.py:2951:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0
+{'train_runtime': 458.5184, 'train_samples_per_second': 4.746, 'train_steps_per_second': 0.297, 'train_loss': 0.8778598624117234, 'epoch': 2.0}
+***** train metrics *****
+  epoch = 2.0
+  total_flos = 19661GF
+  train_loss = 0.8779
+  train_runtime = 0:07:38.51
+  train_samples_per_second = 4.746
+  train_steps_per_second = 0.297
+Figure saved at: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/training_loss.png
+Figure saved at: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen4B_models/qwen_4B_d1_iter2_model/training_eval_loss.png
+***** eval metrics *****
+  epoch = 2.0
+  eval_loss = 0.7857
+  eval_runtime = 0:00:07.93
+  eval_samples_per_second = 15.241
+  eval_steps_per_second = 2.015
+[2024-07-15 10:10:15,488] [INFO] [launch.py:347:main] Process 91735 exits successfully.
+[2024-07-15 10:10:15,488] [INFO] [launch.py:347:main] Process 91737 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91733 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91736 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91738 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91739 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91732 exits successfully.
+[2024-07-15 10:10:15,489] [INFO] [launch.py:347:main] Process 91734 exits successfully.