diff --git "a/qwen14B_models/qwen_14B_d0_iter1_model/lora.log" "b/qwen14B_models/qwen_14B_d0_iter1_model/lora.log" new file mode 100644--- /dev/null +++ "b/qwen14B_models/qwen_14B_d0_iter1_model/lora.log" @@ -0,0 +1,1171 @@ +[2024-07-15 08:39:01,977] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:04,109] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2024-07-15 08:39:04,163] [INFO] [runner.py:571:main] cmd = /ML-A100/team/mm/zhangge/anaconda3/envs/improve/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None ../../src/train_bash.py --deepspeed ../deepspeed/ds_z3_config.json --stage sft --do_train --model_name_or_path /ML-A100/team/mm/eamon/self_instruction/models/Qwen1_5_14B --dataset qwen_14B_d0_iter1_model --dataset_dir ../../data --template qwen_like --finetuning_type lora --lora_target all --lora_rank 8 --lora_alpha 16 --lora_dropout 0.05 --output_dir /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model --overwrite_cache --overwrite_output_dir --cutoff_len 1024 --preprocessing_num_workers 8 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 2 --lr_scheduler_type cosine --logging_steps 10 --warmup_steps 20 --save_steps 100 --eval_steps 100 --evaluation_strategy steps --load_best_model_at_end --learning_rate 5e-5 --num_train_epochs 2.0 --max_samples 3000 --val_size 0.1 --ddp_timeout 180000000 --plot_loss --bf16 +[2024-07-15 08:39:06,208] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_PCI_RELAXED_ORDERING=1 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_DEBUG=INFO +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth1 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_GID_INDEX=7 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_RETRY_CNT=7 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_DISABLE=0 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.19.3 +[2024-07-15 08:39:07,193] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=23 +[2024-07-15 08:39:07,193] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2024-07-15 08:39:07,193] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=8, node_rank=0 +[2024-07-15 08:39:07,193] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2024-07-15 08:39:07,193] [INFO] [launch.py:163:main] dist_world_size=8 +[2024-07-15 08:39:07,193] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2024-07-15 08:39:14,302] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,302] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,303] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,397] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,494] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,578] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,692] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:14,776] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-07-15 08:39:17,630] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,630] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,630] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,630] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2024-07-15 08:39:17,630] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,630] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,739] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,803] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-07-15 08:39:17,811] [INFO] [comm.py:637:init_distributed] cdb=None +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 7, device: cuda:7, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - WARNING - llmtuner.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +07/15/2024 08:39:18 - INFO - llmtuner.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +07/15/2024 08:39:18 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:18 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:18 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:19 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +07/15/2024 08:39:19 - INFO - llmtuner.data.template - Add <|im_end|>,<|endoftext|> to stop words. +t-20240715144206-zn57g-worker-0:53949:53949 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53949:53949 [0] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53949:53949 [0] NCCL INFO cudaDriverVersion 12030 +NCCL version 2.18.1+cuda12.1 +t-20240715144206-zn57g-worker-0:53953:53953 [4] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53953:53953 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53953:53953 [4] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53952:53952 [3] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53952:53952 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53951:53951 [2] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53951:53951 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53954:53954 [5] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53955:53955 [6] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53954:53954 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53956:53956 [7] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53955:53955 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53952:53952 [3] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53956:53956 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53950:53950 [1] NCCL INFO cudaDriverVersion 12030 +t-20240715144206-zn57g-worker-0:53950:53950 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53951:53951 [2] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53954:53954 [5] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53956:53956 [7] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53955:53955 [6] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53950:53950 [1] NCCL INFO Bootstrap : Using eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO P2P plugin IBext +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth1 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1. +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO NET/IB : Using [0]mlx5_1:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB eth1:172.25.16.145<0> +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO NVLS multicast support is not available on dev 7 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO NVLS multicast support is not available on dev 5 +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO NVLS multicast support is not available on dev 1 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO NVLS multicast support is not available on dev 6 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO NVLS multicast support is not available on dev 4 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO NVLS multicast support is not available on dev 2 +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO NVLS multicast support is not available on dev 3 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO NVLS multicast support is not available on dev 0 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 00/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 01/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 02/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 03/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 04/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 00/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 00/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 05/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 01/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 01/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 02/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 06/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 02/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 03/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 07/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 03/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 04/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 08/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 04/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 05/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 09/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 05/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 06/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 10/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 06/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 07/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 11/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 07/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 12/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 08/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 08/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 13/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 09/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 09/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 14/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 10/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 10/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 15/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 11/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 11/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 12/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 12/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 13/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 13/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 14/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Channel 15/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 14/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 15/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 00/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 01/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 02/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 00/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 03/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 01/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 04/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 02/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 05/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 03/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 06/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 04/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 07/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 05/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 08/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 06/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 09/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 07/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 10/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 08/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 09/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 10/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 11/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 11/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 12/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 13/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 14/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 12/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Channel 15/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 13/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 14/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Channel 15/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53952:55169 [3] NCCL INFO comm 0xc73ff80 rank 3 nranks 8 cudaDev 3 busId 2d000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53949:55160 [0] NCCL INFO comm 0xd1d6950 rank 0 nranks 8 cudaDev 0 busId d000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53956:55170 [7] NCCL INFO comm 0xd4e2b30 rank 7 nranks 8 cudaDev 7 busId e4000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53953:55168 [4] NCCL INFO comm 0xdd4a650 rank 4 nranks 8 cudaDev 4 busId c5000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53955:55172 [6] NCCL INFO comm 0xd5058a0 rank 6 nranks 8 cudaDev 6 busId e0000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53950:55173 [1] NCCL INFO comm 0xcd4f700 rank 1 nranks 8 cudaDev 1 busId 13000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53954:55171 [5] NCCL INFO comm 0xc19caa0 rank 5 nranks 8 cudaDev 5 busId ca000 commId 0xeb21f0dba5216079 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53951:55174 [2] NCCL INFO comm 0xca43790 rank 2 nranks 8 cudaDev 2 busId 29000 commId 0xeb21f0dba5216079 - Init COMPLETE +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +07/15/2024 08:39:23 - INFO - llmtuner.data.loader - Loading dataset /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_outputs/qwen_14B_base_d0_10k_output_filtered_evaluated_filtered.json... +07/15/2024 08:39:23 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +input_ids: +[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 7985, 264, 10435, 1948, 1378, 5766, 429, 4990, 1992, 518, 279, 16733, 624, 13079, 25, 8713, 498, 22596, 911, 16307, 3351, 5267, 62502, 25, 2308, 11, 358, 2776, 3520, 12035, 13, 1096, 374, 847, 1156, 882, 20971, 22131, 13, 151645, 198, 151644, 77091, 198, 13079, 25, 8670, 11, 7010, 13, 10967, 525, 498, 19383, 5267, 62502, 25, 358, 2776, 2087, 311, 12095, 311, 3947, 847, 12923, 13, 2932, 594, 12163, 1052, 369, 264, 2421, 1635, 1431, 624, 13079, 25, 45717, 11, 429, 10362, 1075, 458, 7897, 8411, 13, 12243, 498, 1012, 311, 9625, 1573, 5267, 62502, 25, 2308, 11, 419, 374, 847, 1156, 882, 13, 358, 3003, 2677, 4829, 311, 728, 323, 1490, 279, 468, 3092, 301, 21938, 323, 8180, 14093, 1038, 1783, 518, 264, 2205, 51950, 624, 13079, 25, 1446, 2299, 304, 369, 264, 4228, 13, 12095, 374, 264, 6233, 3283, 13, 358, 3003, 1012, 1052, 264, 2421, 3039, 11, 773, 358, 646, 3291, 498, 429, 498, 2299, 2087, 311, 2948, 432, 624, 62502, 25, 2938, 594, 2244, 311, 6723, 13, 358, 2776, 1101, 17811, 911, 847, 10971, 13, 358, 3003, 2581, 52750, 1573, 11, 773, 358, 2776, 264, 2699, 22596, 911, 279, 1896, 1847, 323, 20327, 624, 13079, 25, 4320, 944, 10955, 11, 432, 594, 13942, 4622, 311, 387, 22596, 13, 1988, 279, 35277, 323, 13627, 525, 15387, 11, 323, 807, 2299, 2677, 45778, 979, 432, 4041, 311, 7149, 13, 1446, 3278, 387, 6915, 13, 12343, 11, 279, 6194, 2337, 1896, 1847, 323, 20327, 525, 56092, 624, 62502, 25, 358, 3900, 498, 2299, 1290, 13, 358, 3278, 1430, 537, 311, 387, 2238, 22596, 624, 13079, 25, 19881, 11, 16307, 374, 825, 315, 279, 61744, 19777, 315, 17903, 13, 1597, 3055, 498, 2299, 304, 279, 3720, 11, 498, 646, 11967, 323, 4669, 279, 11877, 13, 4320, 944, 10667, 311, 4446, 264, 1661, 2311, 476, 2494, 311, 8844, 311, 13, 1084, 3278, 1281, 279, 882, 11466, 553, 624, 62502, 25, 9731, 498, 369, 697, 31734, 5771, 11, 3757, 13, 358, 2666, 264, 2699, 2664, 1431, 13, 358, 3278, 1281, 2704, 311, 4446, 2494, 311, 1349, 624, 13079, 25, 2308, 3491, 11, 429, 594, 1128, 358, 2776, 1588, 369, 13, 12243, 264, 2244, 8411, 11, 323, 1513, 944, 10667, 311, 1430, 1045, 315, 279, 2205, 35005, 323, 3947, 678, 279, 11245, 59924, 624, 62502, 25, 9731, 498, 11, 358, 2765, 944, 13, 358, 2776, 8491, 2087, 311, 1896, 697, 9462, 13, 151643] +inputs: +<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +Write a conversation between two characters that takes place at the airport. +John: Are you nervous about flying today? +Jane: No, I'm actually excited. This is my first time traveling abroad.<|im_end|> +<|im_start|>assistant +John: Oh, cool. Where are you headed? +Jane: I'm going to Paris to visit my sister. She's lived there for a few years now. +John: Wow, that sounds like an amazing trip. Have you been to France before? +Jane: No, this is my first time. I've always wanted to go and see the Eiffel Tower and eat croissants at a local café. +John: You're in for a treat. Paris is a beautiful city. I've been there a few times, so I can tell you that you're going to love it. +Jane: That's great to hear. I'm just worried about my flight. I've never flown before, so I'm a bit nervous about the takeoff and landing. +John: Don't worry, it's perfectly normal to be nervous. But the pilots and crew are professionals, and they're always cautious when it comes to safety. You'll be fine. Plus, the views during takeoff and landing are breathtaking. +Jane: I hope you're right. I'll try not to be too nervous. +John: Remember, flying is one of the safest modes of transportation. And once you're in the air, you can relax and enjoy the ride. Don't forget to bring a good book or something to listen to. It'll make the time fly by. +Jane: Thank you for your reassurance, John. I feel a bit better now. I'll make sure to bring something to read. +John: No problem, that's what I'm here for. Have a great trip, and don't forget to try some of the local cuisine and visit all the famous landmarks. +Jane: Thank you, I won't. I'm definitely going to take your advice.<|endoftext|> +label_ids: +[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13079, 25, 8670, 11, 7010, 13, 10967, 525, 498, 19383, 5267, 62502, 25, 358, 2776, 2087, 311, 12095, 311, 3947, 847, 12923, 13, 2932, 594, 12163, 1052, 369, 264, 2421, 1635, 1431, 624, 13079, 25, 45717, 11, 429, 10362, 1075, 458, 7897, 8411, 13, 12243, 498, 1012, 311, 9625, 1573, 5267, 62502, 25, 2308, 11, 419, 374, 847, 1156, 882, 13, 358, 3003, 2677, 4829, 311, 728, 323, 1490, 279, 468, 3092, 301, 21938, 323, 8180, 14093, 1038, 1783, 518, 264, 2205, 51950, 624, 13079, 25, 1446, 2299, 304, 369, 264, 4228, 13, 12095, 374, 264, 6233, 3283, 13, 358, 3003, 1012, 1052, 264, 2421, 3039, 11, 773, 358, 646, 3291, 498, 429, 498, 2299, 2087, 311, 2948, 432, 624, 62502, 25, 2938, 594, 2244, 311, 6723, 13, 358, 2776, 1101, 17811, 911, 847, 10971, 13, 358, 3003, 2581, 52750, 1573, 11, 773, 358, 2776, 264, 2699, 22596, 911, 279, 1896, 1847, 323, 20327, 624, 13079, 25, 4320, 944, 10955, 11, 432, 594, 13942, 4622, 311, 387, 22596, 13, 1988, 279, 35277, 323, 13627, 525, 15387, 11, 323, 807, 2299, 2677, 45778, 979, 432, 4041, 311, 7149, 13, 1446, 3278, 387, 6915, 13, 12343, 11, 279, 6194, 2337, 1896, 1847, 323, 20327, 525, 56092, 624, 62502, 25, 358, 3900, 498, 2299, 1290, 13, 358, 3278, 1430, 537, 311, 387, 2238, 22596, 624, 13079, 25, 19881, 11, 16307, 374, 825, 315, 279, 61744, 19777, 315, 17903, 13, 1597, 3055, 498, 2299, 304, 279, 3720, 11, 498, 646, 11967, 323, 4669, 279, 11877, 13, 4320, 944, 10667, 311, 4446, 264, 1661, 2311, 476, 2494, 311, 8844, 311, 13, 1084, 3278, 1281, 279, 882, 11466, 553, 624, 62502, 25, 9731, 498, 369, 697, 31734, 5771, 11, 3757, 13, 358, 2666, 264, 2699, 2664, 1431, 13, 358, 3278, 1281, 2704, 311, 4446, 2494, 311, 1349, 624, 13079, 25, 2308, 3491, 11, 429, 594, 1128, 358, 2776, 1588, 369, 13, 12243, 264, 2244, 8411, 11, 323, 1513, 944, 10667, 311, 1430, 1045, 315, 279, 2205, 35005, 323, 3947, 678, 279, 11245, 59924, 624, 62502, 25, 9731, 498, 11, 358, 2765, 944, 13, 358, 2776, 8491, 2087, 311, 1896, 697, 9462, 13, 151643] +labels: +John: Oh, cool. Where are you headed? +Jane: I'm going to Paris to visit my sister. She's lived there for a few years now. +John: Wow, that sounds like an amazing trip. Have you been to France before? +Jane: No, this is my first time. I've always wanted to go and see the Eiffel Tower and eat croissants at a local café. +John: You're in for a treat. Paris is a beautiful city. I've been there a few times, so I can tell you that you're going to love it. +Jane: That's great to hear. I'm just worried about my flight. I've never flown before, so I'm a bit nervous about the takeoff and landing. +John: Don't worry, it's perfectly normal to be nervous. But the pilots and crew are professionals, and they're always cautious when it comes to safety. You'll be fine. Plus, the views during takeoff and landing are breathtaking. +Jane: I hope you're right. I'll try not to be too nervous. +John: Remember, flying is one of the safest modes of transportation. And once you're in the air, you can relax and enjoy the ride. Don't forget to bring a good book or something to listen to. It'll make the time fly by. +Jane: Thank you for your reassurance, John. I feel a bit better now. I'll make sure to bring something to read. +John: No problem, that's what I'm here for. Have a great trip, and don't forget to try some of the local cuisine and visit all the famous landmarks. +Jane: Thank you, I won't. I'm definitely going to take your advice.<|endoftext|> +[2024-07-15 08:40:07,143] [INFO] [partition_parameters.py:349:__exit__] finished initializing model - num_params = 483, num_elems = 14.17B +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: q_proj,gate_proj,down_proj,v_proj,o_proj,up_proj,k_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: up_proj,o_proj,down_proj,gate_proj,v_proj,q_proj,k_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: down_proj,up_proj,k_proj,v_proj,o_proj,q_proj,gate_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: gate_proj,up_proj,v_proj,k_proj,down_proj,o_proj,q_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: q_proj,up_proj,v_proj,o_proj,k_proj,down_proj,gate_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: up_proj,k_proj,down_proj,o_proj,v_proj,gate_proj,q_proj +07/15/2024 08:40:19 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:19 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:19 - INFO - llmtuner.model.utils - Found linear modules: k_proj,down_proj,up_proj,v_proj,q_proj,gate_proj,o_proj +07/15/2024 08:40:20 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +07/15/2024 08:40:20 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +07/15/2024 08:40:20 - INFO - llmtuner.model.utils - Found linear modules: gate_proj,v_proj,up_proj,o_proj,k_proj,down_proj,q_proj +07/15/2024 08:40:58 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +07/15/2024 08:40:59 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +[2024-07-15 08:41:00,436] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.0, git-hash=unknown, git-branch=unknown +[2024-07-15 08:41:00,494] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2024-07-15 08:41:00,502] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2024-07-15 08:41:00,502] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2024-07-15 08:41:00,605] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2024-07-15 08:41:00,605] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type= +[2024-07-15 08:41:00,605] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2024-07-15 08:41:00,605] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +07/15/2024 08:41:00 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +07/15/2024 08:41:00 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +[2024-07-15 08:41:00,841] [INFO] [utils.py:791:see_memory_usage] Stage 3 initialize beginning +[2024-07-15 08:41:00,842] [INFO] [utils.py:792:see_memory_usage] MA 3.99 GB Max_MA 6.83 GB CA 4.58 GB Max_CA 8 GB +[2024-07-15 08:41:00,847] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.17 GB, percent = 0.4% +[2024-07-15 08:41:00,861] [INFO] [stage3.py:128:__init__] Reduce bucket size 26214400 +[2024-07-15 08:41:00,862] [INFO] [stage3.py:129:__init__] Prefetch bucket size 23592960 +[2024-07-15 08:41:01,150] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2024-07-15 08:41:01,151] [INFO] [utils.py:792:see_memory_usage] MA 3.99 GB Max_MA 3.99 GB CA 4.58 GB Max_CA 5 GB +[2024-07-15 08:41:01,153] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.18 GB, percent = 0.4% +07/15/2024 08:41:01 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +07/15/2024 08:41:01 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +07/15/2024 08:41:01 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +07/15/2024 08:41:01 - INFO - llmtuner.model.loader - trainable params: 31170560 || all params: 14198461440 || trainable%: 0.2195 +Parameter Offload: Total persistent parameters: 19051520 in 641 params +[2024-07-15 08:41:01,616] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2024-07-15 08:41:01,616] [INFO] [utils.py:792:see_memory_usage] MA 3.94 GB Max_MA 3.99 GB CA 4.58 GB Max_CA 5 GB +[2024-07-15 08:41:01,617] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.17 GB, percent = 0.4% +[2024-07-15 08:41:01,799] [INFO] [utils.py:791:see_memory_usage] Before creating fp16 partitions +[2024-07-15 08:41:01,800] [INFO] [utils.py:792:see_memory_usage] MA 3.94 GB Max_MA 3.94 GB CA 4.58 GB Max_CA 5 GB +[2024-07-15 08:41:01,800] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.17 GB, percent = 0.4% +[2024-07-15 08:41:02,613] [INFO] [utils.py:791:see_memory_usage] After creating fp16 partitions: 1 +[2024-07-15 08:41:02,614] [INFO] [utils.py:792:see_memory_usage] MA 3.94 GB Max_MA 3.94 GB CA 4.46 GB Max_CA 5 GB +[2024-07-15 08:41:02,615] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.23 GB, percent = 0.4% +[2024-07-15 08:41:02,793] [INFO] [utils.py:791:see_memory_usage] Before creating fp32 partitions +[2024-07-15 08:41:02,794] [INFO] [utils.py:792:see_memory_usage] MA 3.94 GB Max_MA 3.94 GB CA 4.46 GB Max_CA 4 GB +[2024-07-15 08:41:02,795] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.23 GB, percent = 0.4% +[2024-07-15 08:41:02,973] [INFO] [utils.py:791:see_memory_usage] After creating fp32 partitions +[2024-07-15 08:41:02,973] [INFO] [utils.py:792:see_memory_usage] MA 3.95 GB Max_MA 3.96 GB CA 4.46 GB Max_CA 4 GB +[2024-07-15 08:41:02,974] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.23 GB, percent = 0.4% +[2024-07-15 08:41:03,156] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states +[2024-07-15 08:41:03,157] [INFO] [utils.py:792:see_memory_usage] MA 3.95 GB Max_MA 3.95 GB CA 4.46 GB Max_CA 4 GB +[2024-07-15 08:41:03,158] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.42 GB, percent = 0.4% +[2024-07-15 08:41:03,386] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states +[2024-07-15 08:41:03,386] [INFO] [utils.py:792:see_memory_usage] MA 3.98 GB Max_MA 4.01 GB CA 4.46 GB Max_CA 4 GB +[2024-07-15 08:41:03,387] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.45 GB, percent = 0.4% +[2024-07-15 08:41:03,387] [INFO] [stage3.py:482:_setup_for_real_optimizer] optimizer state initialized +[2024-07-15 08:41:03,809] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer +[2024-07-15 08:41:03,809] [INFO] [utils.py:792:see_memory_usage] MA 4.04 GB Max_MA 4.04 GB CA 4.46 GB Max_CA 4 GB +[2024-07-15 08:41:03,810] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 8.57 GB, percent = 0.5% +[2024-07-15 08:41:03,810] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW +[2024-07-15 08:41:03,810] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2024-07-15 08:41:03,810] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2024-07-15 08:41:03,810] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)] +[2024-07-15 08:41:03,815] [INFO] [config.py:984:print] DeepSpeedEngine configuration: +[2024-07-15 08:41:03,815] [INFO] [config.py:988:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2024-07-15 08:41:03,815] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2024-07-15 08:41:03,815] [INFO] [config.py:988:print] amp_enabled .................. False +[2024-07-15 08:41:03,815] [INFO] [config.py:988:print] amp_params ................... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] bfloat16_enabled ............. True +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] comms_config ................. +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] communication_data_type ...... None +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] curriculum_params_legacy ..... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] data_efficiency_enabled ...... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] dataloader_drop_last ......... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] disable_allgather ............ False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] dump_state ................... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_enabled ........... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] eigenvalue_verbose ........... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] elasticity_enabled ........... False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] fp16_auto_cast ............... None +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] fp16_enabled ................. False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] global_rank .................. 0 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] grad_accum_dtype ............. None +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] gradient_accumulation_steps .. 2 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] gradient_clipping ............ 1.0 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] graph_harvesting ............. False +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2024-07-15 08:41:03,816] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] load_universal_checkpoint .... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] loss_scale ................... 1.0 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] memory_breakdown ............. False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] mics_hierarchial_params_gather False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] mics_shard_size .............. -1 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] optimizer_name ............... None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] optimizer_params ............. None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] pld_enabled .................. False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] pld_params ................... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] prescale_gradients ........... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] scheduler_name ............... None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] scheduler_params ............. None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] sparse_attention ............. None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] steps_per_print .............. inf +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] train_batch_size ............. 16 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 1 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] use_node_local_storage ....... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] wall_clock_breakdown ......... False +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] weight_quantization_config ... None +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] world_size ................... 8 +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] zero_allow_untested_optimizer True +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=26214400 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=23592960 param_persistence_threshold=51200 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] zero_enabled ................. True +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True +[2024-07-15 08:41:03,817] [INFO] [config.py:988:print] zero_optimization_stage ...... 3 +[2024-07-15 08:41:03,817] [INFO] [config.py:974:print_user_config] json = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 2, + "gradient_clipping": 1.0, + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1.000000e+09, + "reduce_bucket_size": 2.621440e+07, + "stage3_prefetch_bucket_size": 2.359296e+07, + "stage3_param_persistence_threshold": 5.120000e+04, + "stage3_max_live_parameters": 1.000000e+09, + "stage3_max_reuse_distance": 1.000000e+09, + "stage3_gather_16bit_weights_on_model_save": true + }, + "steps_per_print": inf +} +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Using network IBext +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO NVLS multicast support is not available on dev 6 +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO NVLS multicast support is not available on dev 4 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO NVLS multicast support is not available on dev 0 +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO NVLS multicast support is not available on dev 3 +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO NVLS multicast support is not available on dev 2 +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO NVLS multicast support is not available on dev 7 +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,00000000,ffffffff +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO NVLS multicast support is not available on dev 1 +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,00000000,ffffffff,00000000 +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO NVLS multicast support is not available on dev 5 +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 01/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 02/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 03/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 06/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 09/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 10/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 11/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 14/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 7 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO P2P Chunksize set to 524288 +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 00/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 01/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 02/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 03/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 00/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 04/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 00/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 01/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 05/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 01/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 02/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 06/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 02/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 03/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 03/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 07/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 04/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 04/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 08/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 05/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 09/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 05/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 06/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 10/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 06/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 07/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 11/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 07/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 08/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 12/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 08/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 09/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 13/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 09/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 10/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 10/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 11/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 14/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 11/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 15/0 : 1[13000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 12/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 12/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 13/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 7[e4000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 13/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 14/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 14/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 15/0 : 2[29000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Channel 15/0 : 0[d000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 00/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 01/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Connected all rings +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 02/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 03/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 04/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 00/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 01/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 05/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 06/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 02/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 03/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 04/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 05/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 06/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 07/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 07/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 08/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 08/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 09/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 10/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 11/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 12/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 13/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 00/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 01/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 02/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 03/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 04/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 09/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 10/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 14/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Channel 15/0 : 5[ca000] -> 4[c5000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 05/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 06/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 07/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 08/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 09/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 10/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 11/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 12/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 13/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 14/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Channel 15/0 : 1[13000] -> 0[d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 11/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 12/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 13/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 00/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 14/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 01/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 00/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 02/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 01/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Channel 15/0 : 7[e4000] -> 6[e0000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 02/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 00/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 00/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 03/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 03/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 04/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 05/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 01/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 01/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 06/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 02/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 02/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 03/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 03/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 04/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 05/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 06/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 07/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 07/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 08/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 09/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 04/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 04/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 10/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 05/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 05/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 08/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 06/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 06/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 09/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 10/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 07/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 11/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 08/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 12/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 07/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 09/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 13/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 08/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 10/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 11/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 09/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 11/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 12/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 13/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 14/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Channel 15/0 : 3[2d000] -> 2[29000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 14/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 10/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 12/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Channel 15/0 : 6[e0000] -> 5[ca000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 11/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 13/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 12/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 14/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 13/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Channel 15/0 : 2[29000] -> 1[13000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 14/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Channel 15/0 : 4[c5000] -> 3[2d000] via P2P/IPC/read +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO Connected all trees +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO 16 coll channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer +t-20240715144206-zn57g-worker-0:53953:56808 [4] NCCL INFO comm 0x7f0814e897e0 rank 4 nranks 8 cudaDev 4 busId c5000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53950:56812 [1] NCCL INFO comm 0x7febc8fd5530 rank 1 nranks 8 cudaDev 1 busId 13000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53949:56807 [0] NCCL INFO comm 0x7f3cc8fa2ce0 rank 0 nranks 8 cudaDev 0 busId d000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53951:56811 [2] NCCL INFO comm 0x7f0f56a3fa20 rank 2 nranks 8 cudaDev 2 busId 29000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53954:56813 [5] NCCL INFO comm 0x7f0130fd1f00 rank 5 nranks 8 cudaDev 5 busId ca000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53955:56809 [6] NCCL INFO comm 0x7f2c58fd9410 rank 6 nranks 8 cudaDev 6 busId e0000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53952:56810 [3] NCCL INFO comm 0x7f8df8fb0ea0 rank 3 nranks 8 cudaDev 3 busId 2d000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +t-20240715144206-zn57g-worker-0:53956:56814 [7] NCCL INFO comm 0x7f266aa3a800 rank 7 nranks 8 cudaDev 7 busId e4000 commId 0x530c9bc1e9e78cf2 - Init COMPLETE +{'loss': 1.0824, 'grad_norm': 0.586446509459034, 'learning_rate': 2.5e-05, 'epoch': 0.06} +{'loss': 1.0109, 'grad_norm': 2.2316997759806494, 'learning_rate': 5e-05, 'epoch': 0.12} +{'loss': 0.6595, 'grad_norm': 0.4074702236613225, 'learning_rate': 4.98781004037916e-05, 'epoch': 0.18} +{'loss': 0.7391, 'grad_norm': 0.3126617987747972, 'learning_rate': 4.951359037609088e-05, 'epoch': 0.24} +{'loss': 0.7051, 'grad_norm': 0.708271236864645, 'learning_rate': 4.891002460691306e-05, 'epoch': 0.3} +{'loss': 0.7091, 'grad_norm': 0.3826205284015388, 'learning_rate': 4.807328905014201e-05, 'epoch': 0.36} +{'loss': 0.6642, 'grad_norm': 0.4309917712753102, 'learning_rate': 4.7011543523897996e-05, 'epoch': 0.41} +{'loss': 0.6352, 'grad_norm': 0.46078253102774147, 'learning_rate': 4.573514213625505e-05, 'epoch': 0.47} +{'loss': 0.695, 'grad_norm': 0.350545318021008, 'learning_rate': 4.425653231231344e-05, 'epoch': 0.53} +{'loss': 0.6716, 'grad_norm': 1.3918929820024042, 'learning_rate': 4.259013340731224e-05, 'epoch': 0.59} +{'eval_loss': 0.703880250453949, 'eval_runtime': 20.1945, 'eval_samples_per_second': 14.856, 'eval_steps_per_second': 1.882, 'epoch': 0.59} +[2024-07-15 08:47:06,381] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step100 is about to be saved! +[2024-07-15 08:47:06,413] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt +[2024-07-15 08:47:06,413] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2024-07-15 08:47:06,440] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2024-07-15 08:47:06,445] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-07-15 08:47:06,494] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-07-15 08:47:06,497] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-07-15 08:47:06,550] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now! +{'loss': 0.6649, 'grad_norm': 0.46801886344571736, 'learning_rate': 4.075219608954278e-05, 'epoch': 0.65} +{'loss': 0.6485, 'grad_norm': 0.7261378819214507, 'learning_rate': 3.876064386435646e-05, 'epoch': 0.71} +{'loss': 0.5978, 'grad_norm': 0.6311105785325938, 'learning_rate': 3.663489828471953e-05, 'epoch': 0.77} +{'loss': 0.6504, 'grad_norm': 1.0566991845506082, 'learning_rate': 3.4395689552855955e-05, 'epoch': 0.83} +{'loss': 0.6214, 'grad_norm': 0.42416309958564125, 'learning_rate': 3.206485435998498e-05, 'epoch': 0.89} +{'loss': 0.6144, 'grad_norm': 0.42527966548399343, 'learning_rate': 2.9665122935613727e-05, 'epoch': 0.95} +{'loss': 0.6493, 'grad_norm': 0.3979436989011932, 'learning_rate': 2.7219897383073373e-05, 'epoch': 1.01} +{'loss': 0.6474, 'grad_norm': 0.24203747971705217, 'learning_rate': 2.475302346296336e-05, 'epoch': 1.07} +{'loss': 0.564, 'grad_norm': 0.319431105160605, 'learning_rate': 2.2288558050064367e-05, 'epoch': 1.12} +{'loss': 0.5798, 'grad_norm': 0.5080338290607097, 'learning_rate': 1.9850534531472546e-05, 'epoch': 1.18} +{'eval_loss': 0.7010006904602051, 'eval_runtime': 18.8091, 'eval_samples_per_second': 15.95, 'eval_steps_per_second': 2.02, 'epoch': 1.18} +[2024-07-15 08:52:59,254] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved! +[2024-07-15 08:52:59,284] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt +[2024-07-15 08:52:59,284] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2024-07-15 08:52:59,310] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2024-07-15 08:52:59,317] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-07-15 08:52:59,589] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-07-15 08:52:59,590] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-07-15 08:52:59,623] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now! +{'loss': 0.6323, 'grad_norm': 0.564376924960013, 'learning_rate': 1.746272843378493e-05, 'epoch': 1.24} +{'loss': 0.6106, 'grad_norm': 0.498318513735455, 'learning_rate': 1.5148425564932084e-05, 'epoch': 1.3} +{'loss': 0.6374, 'grad_norm': 0.2827517725379645, 'learning_rate': 1.2930194931731382e-05, 'epoch': 1.36} +{'loss': 0.5758, 'grad_norm': 0.5094729039694312, 'learning_rate': 1.0829668647661559e-05, 'epoch': 1.42} +{'loss': 0.5988, 'grad_norm': 0.269479823527664, 'learning_rate': 8.867330977190877e-06, 'epoch': 1.48} +{'loss': 0.5956, 'grad_norm': 0.3701018971236146, 'learning_rate': 7.062318573891716e-06, 'epoch': 1.54} +{'loss': 0.5467, 'grad_norm': 0.43248474882397636, 'learning_rate': 5.4322338604131715e-06, 'epoch': 1.6} +{'loss': 0.599, 'grad_norm': 0.896259597910678, 'learning_rate': 3.992973370223896e-06, 'epoch': 1.66} +{'loss': 0.5505, 'grad_norm': 0.28651524116933125, 'learning_rate': 2.75857272513132e-06, 'epoch': 1.72} +{'loss': 0.5292, 'grad_norm': 0.5096606249052688, 'learning_rate': 1.7410697603511383e-06, 'epoch': 1.78} +{'eval_loss': 0.6997424960136414, 'eval_runtime': 19.0297, 'eval_samples_per_second': 15.765, 'eval_steps_per_second': 1.997, 'epoch': 1.78} +[2024-07-15 08:58:53,716] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step300 is about to be saved! +[2024-07-15 08:58:53,744] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt +[2024-07-15 08:58:53,744] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2024-07-15 08:58:53,771] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2024-07-15 08:58:53,773] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-07-15 08:58:53,830] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-07-15 08:58:53,831] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-07-15 08:58:53,885] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step300 is ready now! +{'loss': 0.6355, 'grad_norm': 0.6033783324728439, 'learning_rate': 9.503871319271551e-07, 'epoch': 1.83} +{'loss': 0.6309, 'grad_norm': 0.5681857650004716, 'learning_rate': 3.9423555131007925e-07, 'epoch': 1.89} +{'loss': 0.5814, 'grad_norm': 0.5632696843447448, 'learning_rate': 7.803859074854425e-08, 'epoch': 1.95} +[2024-07-15 09:00:56,333] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2024-07-15 09:00:56,381] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2024-07-15 09:00:56,381] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2024-07-15 09:00:56,395] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2024-07-15 09:00:56,422] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-07-15 09:00:56,443] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-07-15 09:00:56,443] [INFO] [engine.py:3019:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0 +[2024-07-15 09:00:56,457] [INFO] [engine.py:2951:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0 +{'train_runtime': 1192.6701, 'train_samples_per_second': 4.528, 'train_steps_per_second': 0.283, 'train_loss': 0.6494796699320776, 'epoch': 2.0} +***** train metrics ***** + epoch = 2.0 + total_flos = 114917GF + train_loss = 0.6495 + train_runtime = 0:19:52.67 + train_samples_per_second = 4.528 + train_steps_per_second = 0.283 +Figure saved at: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/training_loss.png +Figure saved at: /ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/training_eval_loss.png +***** eval metrics ***** + epoch = 2.0 + eval_loss = 0.6997 + eval_runtime = 0:00:18.49 + eval_samples_per_second = 16.221 + eval_steps_per_second = 2.055 +[2024-07-15 09:01:33,631] [INFO] [launch.py:347:main] Process 53952 exits successfully. +[2024-07-15 09:01:33,632] [INFO] [launch.py:347:main] Process 53954 exits successfully. +[2024-07-15 09:01:33,632] [INFO] [launch.py:347:main] Process 53953 exits successfully. +[2024-07-15 09:01:33,632] [INFO] [launch.py:347:main] Process 53956 exits successfully. +[2024-07-15 09:01:33,632] [INFO] [launch.py:347:main] Process 53949 exits successfully. +[2024-07-15 09:01:33,633] [INFO] [launch.py:347:main] Process 53951 exits successfully. +[2024-07-15 09:01:34,634] [INFO] [launch.py:347:main] Process 53955 exits successfully. +[2024-07-15 09:01:34,634] [INFO] [launch.py:347:main] Process 53950 exits successfully.