Muennighoff
committed on
Commit
·
7f89c55
1
Parent(s):
e19438b
Add
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- {14m2b7100mv → 14m2b7100m}/3307601.err +0 -0
- {14m2b7100mv → 14m2b7100m}/3307601.out +0 -0
- 21m400m400m/3487337.err +141 -0
- 21m400m400m/3487337.out +852 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- 21m400m400m/global_step762/layer_01-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_03-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_04-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_05-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_06-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_07-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/layer_09-model_00-model_states.pt +3 -0
- 21m400m400m/global_step762/mp_rank_00_model_states.pt +3 -0
- 21m400m400m/sbatch_21m400m400m.sh +172 -0
- 21m400m400m/sbatch_21m400m400mval.sh +167 -0
- 21m400m400m/tensorboard_21m400m400m/events.out.tfevents.1683665937.nid005223.63334.0 +3 -0
- 21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666595.nid007269.50912.0 +3 -0
- 21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666830.nid007269.54799.0 +3 -0
- 220m200b1b5/sbatch_220m200b1b5.sh +166 -0
- 220m200b1b5/sbatch_220m200b1b5val.sh +167 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006529.96495.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006860.1508.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679054214.nid005116.13183.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793013.nid006063.122512.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793015.nid006273.117983.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid005651.127873.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid006265.117563.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006567.56933.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006575.55528.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793022.nid005643.128637.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793023.nid005499.130838.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793030.nid006090.118840.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793032.nid006082.119082.0 +3 -0
- 220m200b1b5/tensorboard_220m200b1b5val/events.out.tfevents.1679048250.nid005617.100428.0 +3 -0
- 2b812b4b/tensorboard_2b812b4bval/events.out.tfevents.1683534410.nid005943.115511.0 +3 -0
- 2b816b4b/tensorboard_2b816b4bval/events.out.tfevents.1683561719.nid006565.21977.0 +3 -0
- 4b248b12b/3490059.err +0 -0
- 4b248b12b/3490059.out +0 -0
- 4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- 4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt +3 -0
- 4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +3 -0
- 4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +3 -0
- 4b248b12b/global_step45776/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +3 -0
{14m2b7100mv → 14m2b7100m}/3307601.err
RENAMED
File without changes
|
{14m2b7100mv → 14m2b7100m}/3307601.out
RENAMED
File without changes
|
21m400m400m/3487337.err
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0: 2023-05-10 00:12:26.642566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
2 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
3 |
+
0: 2023-05-10 00:12:26.642574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
4 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
5 |
+
0: 2023-05-10 00:12:26.642587: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
6 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
7 |
+
0: 2023-05-10 00:12:26.642609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
8 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
9 |
+
0: 2023-05-10 00:12:26.642609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
10 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
11 |
+
0: 2023-05-10 00:12:26.642625: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
12 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
13 |
+
0: 2023-05-10 00:12:26.642632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
14 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
15 |
+
0: 2023-05-10 00:12:26.642626: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
|
16 |
+
0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
17 |
+
0: 2023-05-10 00:12:35.492522: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
18 |
+
0: 2023-05-10 00:12:35.492558: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
19 |
+
0: 2023-05-10 00:12:35.492577: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
20 |
+
0: 2023-05-10 00:12:35.492623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
21 |
+
0: 2023-05-10 00:12:35.492620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
22 |
+
0: 2023-05-10 00:12:35.492624: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
23 |
+
0: 2023-05-10 00:12:35.492635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
24 |
+
0: 2023-05-10 00:12:35.492657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
25 |
+
0: 2023-05-10 00:12:35.493431: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
26 |
+
0: 2023-05-10 00:12:35.493451: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
27 |
+
0: 2023-05-10 00:12:35.493467: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
28 |
+
0: 2023-05-10 00:12:35.493463: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
29 |
+
0: 2023-05-10 00:12:35.493489: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
30 |
+
0: 2023-05-10 00:12:35.493493: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
31 |
+
0: 2023-05-10 00:12:35.493496: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
32 |
+
0: 2023-05-10 00:12:35.493501: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
|
33 |
+
0: 2023-05-10 00:12:57.097945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
34 |
+
0: 2023-05-10 00:12:57.097970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
35 |
+
0: 2023-05-10 00:12:57.097996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
36 |
+
0: 2023-05-10 00:12:57.098006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
37 |
+
0: 2023-05-10 00:12:57.098019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
38 |
+
0: 2023-05-10 00:12:57.098024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
39 |
+
0: 2023-05-10 00:12:57.098033: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
40 |
+
0: 2023-05-10 00:12:57.098214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
41 |
+
0: 2023-05-10 00:12:57.118445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
42 |
+
0: 2023-05-10 00:12:57.118446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
43 |
+
0: 2023-05-10 00:12:57.118459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
44 |
+
0: 2023-05-10 00:12:57.118463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
45 |
+
0: 2023-05-10 00:12:57.118466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
46 |
+
0: 2023-05-10 00:12:57.118480: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
47 |
+
0: 2023-05-10 00:12:57.118479: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
48 |
+
0: 2023-05-10 00:12:57.118472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
49 |
+
0: 2023-05-10 00:12:57.118472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
50 |
+
0: 2023-05-10 00:12:57.118473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
|
51 |
+
0: 2023-05-10 00:12:57.118504: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
52 |
+
0: 2023-05-10 00:12:57.118507: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
53 |
+
0: 2023-05-10 00:12:57.118503: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
54 |
+
0: 2023-05-10 00:12:57.118515: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
55 |
+
0: 2023-05-10 00:12:57.118516: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
56 |
+
0: 2023-05-10 00:12:57.118517: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
|
57 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
58 |
+
0: Detected CUDA files, patching ldflags
|
59 |
+
0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
|
60 |
+
0: Building extension module scaled_upper_triang_masked_softmax_cuda...
|
61 |
+
0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
62 |
+
0: Loading extension module scaled_upper_triang_masked_softmax_cuda...
|
63 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
64 |
+
0: Detected CUDA files, patching ldflags
|
65 |
+
0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
|
66 |
+
0: Building extension module scaled_masked_softmax_cuda...
|
67 |
+
0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
68 |
+
0: Loading extension module scaled_masked_softmax_cuda...
|
69 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
70 |
+
0: Detected CUDA files, patching ldflags
|
71 |
+
0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
|
72 |
+
0: Building extension module fused_mix_prec_layer_norm_cuda...
|
73 |
+
0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
74 |
+
0: Loading extension module fused_mix_prec_layer_norm_cuda...
|
75 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
76 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
77 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
78 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
79 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
80 |
+
0: [92mSuccessfully preprocessed all matching files.[0m
|
81 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
82 |
+
0: warnings.warn(
|
83 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
84 |
+
0: warnings.warn(
|
85 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
86 |
+
0: warnings.warn(
|
87 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
88 |
+
0: warnings.warn(
|
89 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
90 |
+
0: warnings.warn(
|
91 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
92 |
+
0: warnings.warn(
|
93 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
94 |
+
0: warnings.warn(
|
95 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
|
96 |
+
0: warnings.warn(
|
97 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
98 |
+
0:
|
99 |
+
0:
|
100 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
101 |
+
0:
|
102 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
103 |
+
0:
|
104 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
105 |
+
0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja...
|
106 |
+
0: Building extension module utils...
|
107 |
+
0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
108 |
+
0: Loading extension module utils...
|
109 |
+
0: Loading extension module utils...
|
110 |
+
0: Loading extension module utils...
|
111 |
+
0: Loading extension module utils...
|
112 |
+
0: Loading extension module utils...
|
113 |
+
0: Loading extension module utils...
|
114 |
+
0: Loading extension module utils...
|
115 |
+
0: Loading extension module utils...
|
116 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
117 |
+
0:
|
118 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
119 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
120 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
121 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
122 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
123 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...
|
124 |
+
0:
|
125 |
+
0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...
|
126 |
+
0:
|
127 |
+
0:
|
128 |
+
0: Loading extension module utils...
|
129 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...
|
130 |
+
0: Loading extension module utils...
|
131 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...
|
132 |
+
0: Loading extension module utils...
|
133 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...
|
134 |
+
0: Loading extension module utils...
|
135 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...
|
136 |
+
0: Loading extension module utils...
|
137 |
+
0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
|
138 |
+
0: No modifications detected for re-loaded extension module utils, skipping build step...
|
139 |
+
0: Loading extension module utils...
|
140 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
|
141 |
+
0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
|
21m400m400m/3487337.out
ADDED
@@ -0,0 +1,852 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model parameters: d_model 288 ffw_size 1152 kv_size 32 n_heads 7 n_layers 5
|
2 |
+
Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 5 --hidden-size 288 --num-attention-heads 7 --kv-channels 32 --ffn-hidden-size 1152 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 32 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-21m400m400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_21m400m400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_21m400m400m --load checkpoints_21m400m400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3487337.json --zero-stage 0
|
3 |
+
START 3487337: Wed 10 May 2023 12:11:40 AM EEST
|
4 |
+
0:
|
5 |
+
0:
|
6 |
+
0: ======================= ROCm System Management Interface =======================
|
7 |
+
0: ================================= Concise Info =================================
|
8 |
+
0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
|
9 |
+
0: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
10 |
+
0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
11 |
+
0: 2 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
12 |
+
0: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
13 |
+
0: 4 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
14 |
+
0: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
15 |
+
0: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
|
16 |
+
0: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
|
17 |
+
0: ================================================================================
|
18 |
+
0: ============================= End of ROCm SMI Log ==============================
|
19 |
+
0: Launching on nid007269 (0/1), master nid007269 port 9999, GPUs 8, CUDA: True
|
20 |
+
0: using world size: 8, data-parallel-size: 8, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
|
21 |
+
0: accumulate and all-reduce gradients in fp32 for bfloat16 data type.
|
22 |
+
0: using torch.bfloat16 for parameters ...
|
23 |
+
0: ------------------------ arguments ------------------------
|
24 |
+
0: abort_on_unmet_fused_kernel_constraints ......... False
|
25 |
+
0: accumulate_allreduce_grads_in_fp32 .............. True
|
26 |
+
0: adam_beta1 ...................................... 0.9
|
27 |
+
0: adam_beta2 ...................................... 0.999
|
28 |
+
0: adam_eps ........................................ 1e-08
|
29 |
+
0: adlr_autoresume ................................. False
|
30 |
+
0: adlr_autoresume_interval ........................ 1000
|
31 |
+
0: apply_query_key_layer_scaling ................... True
|
32 |
+
0: apply_residual_connection_post_layernorm ........ False
|
33 |
+
0: attention_dropout ............................... 0.1
|
34 |
+
0: attention_softmax_in_fp32 ....................... False
|
35 |
+
0: bert_binary_head ................................ True
|
36 |
+
0: bert_load ....................................... None
|
37 |
+
0: bf16 ............................................ True
|
38 |
+
0: bias_dropout_fusion ............................. True
|
39 |
+
0: bias_gelu_fusion ................................ True
|
40 |
+
0: biencoder_projection_dim ........................ 0
|
41 |
+
0: biencoder_shared_query_context_model ............ False
|
42 |
+
0: block_data_path ................................. None
|
43 |
+
0: checkpoint_activations .......................... False
|
44 |
+
0: checkpoint_in_cpu ............................... False
|
45 |
+
0: checkpoint_num_layers ........................... 1
|
46 |
+
0: clip_grad ....................................... 1.0
|
47 |
+
0: codecarbon_dir .................................. None
|
48 |
+
0: consumed_train_samples .......................... 0
|
49 |
+
0: consumed_train_tokens ........................... 0
|
50 |
+
0: consumed_valid_samples .......................... 0
|
51 |
+
0: contigious_checkpointing ........................ False
|
52 |
+
0: cpu_optimizer ................................... False
|
53 |
+
0: cpu_torch_adam .................................. False
|
54 |
+
0: curriculum_learning ............................. False
|
55 |
+
0: data_impl ....................................... mmap
|
56 |
+
0: data_parallel_size .............................. 8
|
57 |
+
0: data_path ....................................... None
|
58 |
+
0: dataloader_type ................................. single
|
59 |
+
0: DDP_impl ........................................ local
|
60 |
+
0: decoder_seq_length .............................. None
|
61 |
+
0: deepscale ....................................... False
|
62 |
+
0: deepscale_config ................................ None
|
63 |
+
0: deepspeed ....................................... True
|
64 |
+
0: deepspeed_activation_checkpointing .............. False
|
65 |
+
0: deepspeed_config ................................ ds_configs/3487337.json
|
66 |
+
0: deepspeed_mpi ................................... False
|
67 |
+
0: distribute_checkpointed_activations ............. False
|
68 |
+
0: distributed_backend ............................. nccl
|
69 |
+
0: embed_layernorm ................................. False
|
70 |
+
0: embedding_path .................................. None
|
71 |
+
0: encoder_seq_length .............................. 2048
|
72 |
+
0: eod_mask_loss ................................... False
|
73 |
+
0: eval_interval ................................... 1
|
74 |
+
0: eval_iters ...................................... 100
|
75 |
+
0: eval_only ....................................... None
|
76 |
+
0: evidence_data_path .............................. None
|
77 |
+
0: exit_duration_in_mins ........................... None
|
78 |
+
0: exit_interval ................................... None
|
79 |
+
0: ffn_hidden_size ................................. 1152
|
80 |
+
0: finetune ........................................ False
|
81 |
+
0: fp16 ............................................ False
|
82 |
+
0: fp16_lm_cross_entropy ........................... False
|
83 |
+
0: fp32_residual_connection ........................ False
|
84 |
+
0: gigaflos_no_embeds .............................. 0
|
85 |
+
0: global_batch_size ............................... 256
|
86 |
+
0: glu_activation .................................. None
|
87 |
+
0: hidden_dropout .................................. 0.1
|
88 |
+
0: hidden_size ..................................... 288
|
89 |
+
0: hysteresis ...................................... 2
|
90 |
+
0: ict_head_size ................................... None
|
91 |
+
0: ict_load ........................................ None
|
92 |
+
0: img_dim ......................................... 224
|
93 |
+
0: indexer_batch_size .............................. 128
|
94 |
+
0: indexer_log_interval ............................ 1000
|
95 |
+
0: inference ....................................... False
|
96 |
+
0: init_method_std ................................. 0.02
|
97 |
+
0: init_method_xavier_uniform ...................... False
|
98 |
+
0: initial_loss_scale .............................. 4294967296
|
99 |
+
0: kill_switch_path ................................ kill-switch-21m400m400mval
|
100 |
+
0: kv_channels ..................................... 32
|
101 |
+
0: layer_norm_fusion ............................... True
|
102 |
+
0: layernorm_epsilon ............................... 1e-05
|
103 |
+
0: lazy_mpu_init ................................... None
|
104 |
+
0: load ............................................ checkpoints_21m400m400m
|
105 |
+
0: local_rank ...................................... None
|
106 |
+
0: log_batch_size_to_tensorboard ................... True
|
107 |
+
0: log_interval .................................... 10
|
108 |
+
0: log_learning_rate_to_tensorboard ................ True
|
109 |
+
0: log_level ....................................... None
|
110 |
+
0: log_level_replica ............................... None
|
111 |
+
0: log_loss_scale_to_tensorboard ................... True
|
112 |
+
0: log_num_zeros_in_grad ........................... False
|
113 |
+
0: log_params_norm ................................. False
|
114 |
+
0: log_path ........................................ None
|
115 |
+
0: log_timers_to_tensorboard ....................... True
|
116 |
+
0: log_validation_ppl_to_tensorboard ............... True
|
117 |
+
0: loss_on_targets_only ............................ False
|
118 |
+
0: loss_scale ...................................... None
|
119 |
+
0: loss_scale_window ............................... 1000
|
120 |
+
0: lr .............................................. 0.0002
|
121 |
+
0: lr_decay_iters .................................. None
|
122 |
+
0: lr_decay_samples ................................ 1
|
123 |
+
0: lr_decay_style .................................. cosine
|
124 |
+
0: lr_decay_tokens ................................. None
|
125 |
+
0: lr_warmup_fraction .............................. None
|
126 |
+
0: lr_warmup_iters ................................. 0
|
127 |
+
0: lr_warmup_samples ............................... 0
|
128 |
+
0: make_vocab_size_divisible_by .................... 128
|
129 |
+
0: mask_prob ....................................... 0.15
|
130 |
+
0: masked_softmax_fusion ........................... True
|
131 |
+
0: max_position_embeddings ......................... 2048
|
132 |
+
0: mean_noise_span_length .......................... None
|
133 |
+
0: memory_centric_tiled_linear ..................... False
|
134 |
+
0: merge_file ...................................... gpt2/merges.txt
|
135 |
+
0: micro_batch_size ................................ 32
|
136 |
+
0: min_loss_scale .................................. 1.0
|
137 |
+
0: min_lr .......................................... 2e-05
|
138 |
+
0: mmap_warmup ..................................... False
|
139 |
+
0: no_load_optim ................................... True
|
140 |
+
0: no_load_rng ..................................... None
|
141 |
+
0: no_save_optim ................................... None
|
142 |
+
0: no_save_rng ..................................... None
|
143 |
+
0: noise_density ................................... None
|
144 |
+
0: num_attention_heads ............................. 7
|
145 |
+
0: num_channels .................................... 3
|
146 |
+
0: num_classes ..................................... 1000
|
147 |
+
0: num_layers ...................................... 5
|
148 |
+
0: num_layers_per_virtual_pipeline_stage ........... None
|
149 |
+
0: num_workers ..................................... 2
|
150 |
+
0: onnx_safe ....................................... None
|
151 |
+
0: openai_gelu ..................................... False
|
152 |
+
0: optimizer ....................................... adam
|
153 |
+
0: optimizer_fusion ................................ True
|
154 |
+
0: override_lr_scheduler ........................... True
|
155 |
+
0: pad_vocab_size_to ............................... None
|
156 |
+
0: params_dtype .................................... torch.bfloat16
|
157 |
+
0: partition_activations ........................... False
|
158 |
+
0: patch_dim ....................................... 16
|
159 |
+
0: pipeline_model_parallel_size .................... 1
|
160 |
+
0: position_embedding_type ......................... PositionEmbeddingType.absolute
|
161 |
+
0: pp_partition_method ............................. None
|
162 |
+
0: profile_backward ................................ False
|
163 |
+
0: query_in_block_prob ............................. 0.1
|
164 |
+
0: rampup_batch_size ............................... None
|
165 |
+
0: rank ............................................ 0
|
166 |
+
0: remote_device ................................... none
|
167 |
+
0: reset_attention_mask ............................ False
|
168 |
+
0: reset_position_ids .............................. False
|
169 |
+
0: reset_progress .................................. True
|
170 |
+
0: retriever_report_topk_accuracies ................ []
|
171 |
+
0: retriever_score_scaling ......................... False
|
172 |
+
0: retriever_seq_length ............................ 256
|
173 |
+
0: reweight_loss_based_on_position_frequency ....... False
|
174 |
+
0: sample_rate ..................................... 1.0
|
175 |
+
0: save ............................................ checkpoints_21m400m400m
|
176 |
+
0: save_interval ................................... 1000
|
177 |
+
0: scatter_gather_tensors_in_pipeline .............. True
|
178 |
+
0: scattered_embeddings ............................ False
|
179 |
+
0: seed ............................................ 1234
|
180 |
+
0: seq_length ...................................... 2048
|
181 |
+
0: sgd_momentum .................................... 0.9
|
182 |
+
0: short_seq_prob .................................. 0.1
|
183 |
+
0: skip_train_iteration_range ...................... None
|
184 |
+
0: split ........................................... None
|
185 |
+
0: split_transformers .............................. False
|
186 |
+
0: sync_tp_duplicated_parameters ................... False
|
187 |
+
0: synchronize_each_layer .......................... False
|
188 |
+
0: tensor_model_parallel_size ...................... 1
|
189 |
+
0: tensorboard_dir ................................. tensorboard_21m400m400mval
|
190 |
+
0: tensorboard_log_interval ........................ 1
|
191 |
+
0: tensorboard_queue_size .......................... 5
|
192 |
+
0: test_weighted_split_paths ....................... None
|
193 |
+
0: test_weighted_split_paths_path .................. None
|
194 |
+
0: tile_factor ..................................... 1
|
195 |
+
0: titles_data_path ................................ None
|
196 |
+
0: tokenizer_name_or_path .......................... None
|
197 |
+
0: tokenizer_type .................................. GPT2BPETokenizer
|
198 |
+
0: train_iters ..................................... None
|
199 |
+
0: train_samples ................................... 1
|
200 |
+
0: train_tokens .................................... None
|
201 |
+
0: train_weighted_split_names ...................... ['train']
|
202 |
+
0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']]
|
203 |
+
0: train_weighted_split_paths_path ................. None
|
204 |
+
0: train_weighted_split_splits ..................... [['0:1']]
|
205 |
+
0: train_weighted_split_weights .................... [['1.0']]
|
206 |
+
0: universal_checkpoint ............................ False
|
207 |
+
0: use_bnb_optimizer ............................... False
|
208 |
+
0: use_checkpoint_lr_scheduler ..................... False
|
209 |
+
0: use_contiguous_buffers_in_ddp ................... True
|
210 |
+
0: use_cpu_initialization .......................... None
|
211 |
+
0: use_one_sent_docs ............................... False
|
212 |
+
0: use_pin_memory .................................. False
|
213 |
+
0: valid_num_workers ............................... 2
|
214 |
+
0: valid_weighted_split_names ...................... ['validation']
|
215 |
+
0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']]
|
216 |
+
0: valid_weighted_split_paths_path ................. None
|
217 |
+
0: valid_weighted_split_splits ..................... [['0:1']]
|
218 |
+
0: valid_weighted_split_weights .................... [['1.0']]
|
219 |
+
0: virtual_pipeline_model_parallel_size ............ None
|
220 |
+
0: vocab_extra_ids ................................. 0
|
221 |
+
0: vocab_file ...................................... gpt2/vocab.json
|
222 |
+
0: weight_decay .................................... 0.1
|
223 |
+
0: world_size ...................................... 8
|
224 |
+
0: zero_allgather_bucket_size ...................... 0.0
|
225 |
+
0: zero_contigious_gradients ....................... False
|
226 |
+
0: zero_reduce_bucket_size ......................... 0.0
|
227 |
+
0: zero_reduce_scatter ............................. False
|
228 |
+
0: zero_stage ...................................... 0
|
229 |
+
0: -------------------- end of arguments ---------------------
|
230 |
+
0: setting number of micro-batches to constant 1
|
231 |
+
0: > building GPT2BPETokenizer tokenizer ...
|
232 |
+
0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
|
233 |
+
0: DeepSpeed general environment info:
|
234 |
+
0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch']
|
235 |
+
0: torch version .................... 1.13.0+rocm5.2
|
236 |
+
0: torch cuda version ............... None
|
237 |
+
0: torch hip version ................ 5.2.21151-afdc89f8
|
238 |
+
0: nvcc version ..................... None
|
239 |
+
0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed']
|
240 |
+
0: deepspeed info ................... 0.7.5, unknown, unknown
|
241 |
+
0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1
|
242 |
+
0: **** Git info for Megatron: git_hash=unknown git_branch=unknown ****
|
243 |
+
0: > initializing torch distributed ...
|
244 |
+
0: [2023-05-10 00:13:50,262] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
245 |
+
0: > setting tensorboard ...
|
246 |
+
0: > initializing tensor model parallel with size 1
|
247 |
+
0: > initializing pipeline model parallel with size 1
|
248 |
+
0: > setting random seeds to 1234 ...
|
249 |
+
0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
|
250 |
+
0: > compiling dataset index builder ...
|
251 |
+
0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
252 |
+
0: make: Nothing to be done for 'default'.
|
253 |
+
0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
|
254 |
+
0: >>> done with dataset index builder. Compilation time: 0.118 seconds
|
255 |
+
0: > compiling and loading fused kernels ...
|
256 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified]
|
257 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
258 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
259 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
260 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified]
|
261 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
262 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
263 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
264 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
265 |
+
0: Total number of unsupported CUDA function calls: 0
|
266 |
+
0:
|
267 |
+
0:
|
268 |
+
0: Total number of replaced kernel launches: 87
|
269 |
+
0: ninja: no work to do.
|
270 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified]
|
271 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified]
|
272 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
273 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
274 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
275 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
276 |
+
0: Total number of unsupported CUDA function calls: 0
|
277 |
+
0:
|
278 |
+
0:
|
279 |
+
0: Total number of replaced kernel launches: 63
|
280 |
+
0: ninja: no work to do.
|
281 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes]
|
282 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified]
|
283 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
|
284 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
|
285 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
|
286 |
+
0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
|
287 |
+
0: Total number of unsupported CUDA function calls: 0
|
288 |
+
0:
|
289 |
+
0:
|
290 |
+
0: Total number of replaced kernel launches: 67
|
291 |
+
0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so
|
292 |
+
0: >>> done with compiling and loading fused kernels. Compilation time: 11.020 seconds
|
293 |
+
0: time to initialize megatron (seconds): 26.195
|
294 |
+
0: [after megatron is initialized] datetime: 2023-05-10 00:14:02
|
295 |
+
0: building GPT model ...
|
296 |
+
0: [2023-05-10 00:14:02,324] [INFO] [utils.py:827:see_memory_usage] Before Building Model
|
297 |
+
0: [2023-05-10 00:14:02,325] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
|
298 |
+
0: [2023-05-10 00:14:02,325] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.4 GB, percent = 7.4%
|
299 |
+
0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
|
300 |
+
0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7}
|
301 |
+
0: [2023-05-10 00:14:02,576] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer
|
302 |
+
0: stage=0 layers=12
|
303 |
+
0: 0: _to_float16
|
304 |
+
0: 1: EmbeddingPipe
|
305 |
+
0: 2: <lambda>
|
306 |
+
0: 3: ParallelTransformerLayerPipe
|
307 |
+
0: 4: ParallelTransformerLayerPipe
|
308 |
+
0: 5: ParallelTransformerLayerPipe
|
309 |
+
0: 6: ParallelTransformerLayerPipe
|
310 |
+
0: 7: ParallelTransformerLayerPipe
|
311 |
+
0: 8: undo
|
312 |
+
0: 9: MixedFusedLayerNorm
|
313 |
+
0: 10: EmbeddingPipe
|
314 |
+
0: 11: float16_to_fp32
|
315 |
+
0: loss: CrossEntropy
|
316 |
+
0: [2023-05-10 00:14:03,105] [INFO] [utils.py:827:see_memory_usage] After Building Model
|
317 |
+
0: [2023-05-10 00:14:03,106] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
|
318 |
+
0: [2023-05-10 00:14:03,106] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.41 GB, percent = 7.4%
|
319 |
+
0: setting training iterations to 0
|
320 |
+
0: > learning rate decay style: cosine
|
321 |
+
0: DeepSpeed is enabled.
|
322 |
+
0: [2023-05-10 00:14:03,107] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown
|
323 |
+
0: [2023-05-10 00:14:08,294] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
|
324 |
+
0: [2023-05-10 00:14:08,295] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer
|
325 |
+
0: [2023-05-10 00:14:08,295] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer
|
326 |
+
0: [2023-05-10 00:14:08,296] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
|
327 |
+
0: [2023-05-10 00:14:08,296] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer
|
328 |
+
0: [2023-05-10 00:14:08,414] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer
|
329 |
+
0: [2023-05-10 00:14:08,415] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
|
330 |
+
0: [2023-05-10 00:14:08,415] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 38.96 GB, percent = 7.7%
|
331 |
+
0: ninja: no work to do.
|
332 |
+
0: Time to load utils op: 0.7600808143615723 seconds
|
333 |
+
0: Time to load utils op: 0.7584981918334961 seconds
|
334 |
+
0: Time to load utils op: 0.7595300674438477 seconds
|
335 |
+
0: Time to load utils op: 0.7585179805755615 seconds
|
336 |
+
0: Time to load utils op: 0.7589340209960938 seconds
|
337 |
+
0: Time to load utils op: 0.7584795951843262 seconds
|
338 |
+
0: Time to load utils op: 0.6399149894714355 seconds
|
339 |
+
0: Time to load utils op: 0.760486364364624 seconds
|
340 |
+
0: [2023-05-10 00:14:09,166] [INFO] [utils.py:827:see_memory_usage] before initializing group 0
|
341 |
+
0: [2023-05-10 00:14:09,167] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
|
342 |
+
0: [2023-05-10 00:14:09,167] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 38.29 GB, percent = 7.6%
|
343 |
+
0: Time to load utils op: 0.0005950927734375 secondsTime to load utils op: 0.00048732757568359375 seconds
|
344 |
+
0:
|
345 |
+
0: Time to load utils op: 0.00044083595275878906 seconds
|
346 |
+
0: Time to load utils op: 0.0006310939788818359 seconds
|
347 |
+
0: Time to load utils op: 0.0004439353942871094 seconds
|
348 |
+
0: Time to load utils op: 0.0006113052368164062 seconds
|
349 |
+
0: Time to load utils op: 0.0011534690856933594 seconds
|
350 |
+
0: [2023-05-10 00:14:10,015] [INFO] [utils.py:827:see_memory_usage] after initializing group 0
|
351 |
+
0: [2023-05-10 00:14:10,016] [INFO] [utils.py:828:see_memory_usage] MA 0.11 GB Max_MA 0.11 GB CA 0.15 GB Max_CA 0 GB
|
352 |
+
0: [2023-05-10 00:14:10,016] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.55 GB, percent = 7.5%
|
353 |
+
0: [2023-05-10 00:14:10,123] [INFO] [utils.py:827:see_memory_usage] before initializing group 1
|
354 |
+
0: [2023-05-10 00:14:10,124] [INFO] [utils.py:828:see_memory_usage] MA 0.11 GB Max_MA 0.11 GB CA 0.15 GB Max_CA 0 GB
|
355 |
+
0: [2023-05-10 00:14:10,124] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
356 |
+
0: [2023-05-10 00:14:10,226] [INFO] [utils.py:827:see_memory_usage] after initializing group 1
|
357 |
+
0: [2023-05-10 00:14:10,226] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
|
358 |
+
0: [2023-05-10 00:14:10,227] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
359 |
+
0: [2023-05-10 00:14:10,327] [INFO] [utils.py:827:see_memory_usage] before initializing group 2
|
360 |
+
0: [2023-05-10 00:14:10,328] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
|
361 |
+
0: [2023-05-10 00:14:10,328] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
362 |
+
0: [2023-05-10 00:14:10,430] [INFO] [utils.py:827:see_memory_usage] after initializing group 2
|
363 |
+
0: [2023-05-10 00:14:10,430] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
|
364 |
+
0: [2023-05-10 00:14:10,430] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
365 |
+
0: [2023-05-10 00:14:10,531] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer
|
366 |
+
0: [2023-05-10 00:14:10,531] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
|
367 |
+
0: [2023-05-10 00:14:10,531] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
368 |
+
0: [2023-05-10 00:14:10,637] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer
|
369 |
+
0: [2023-05-10 00:14:10,638] [INFO] [utils.py:828:see_memory_usage] MA 0.14 GB Max_MA 0.14 GB CA 0.15 GB Max_CA 0 GB
|
370 |
+
0: [2023-05-10 00:14:10,638] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
371 |
+
0: [2023-05-10 00:14:10,738] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer
|
372 |
+
0: [2023-05-10 00:14:10,739] [INFO] [utils.py:828:see_memory_usage] MA 0.14 GB Max_MA 0.14 GB CA 0.15 GB Max_CA 0 GB
|
373 |
+
0: [2023-05-10 00:14:10,739] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
|
374 |
+
0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
|
375 |
+
0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler
|
376 |
+
0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = <megatron.learning_rates.AnnealingLR object at 0x14b88d151fd0>
|
377 |
+
0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)]
|
378 |
+
0: [2023-05-10 00:14:10,739] [INFO] [config.py:1007:print] DeepSpeedEngine configuration:
|
379 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] activation_checkpointing_config {
|
380 |
+
0: "partition_activations": false,
|
381 |
+
0: "contiguous_memory_optimization": false,
|
382 |
+
0: "cpu_checkpointing": false,
|
383 |
+
0: "number_checkpoints": null,
|
384 |
+
0: "synchronize_checkpoint_boundary": false,
|
385 |
+
0: "profile": false
|
386 |
+
0: }
|
387 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
|
388 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] amp_enabled .................. False
|
389 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] amp_params ................... False
|
390 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] autotuning_config ............ {
|
391 |
+
0: "enabled": false,
|
392 |
+
0: "start_step": null,
|
393 |
+
0: "end_step": null,
|
394 |
+
0: "metric_path": null,
|
395 |
+
0: "arg_mappings": null,
|
396 |
+
0: "metric": "throughput",
|
397 |
+
0: "model_info": null,
|
398 |
+
0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results",
|
399 |
+
0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps",
|
400 |
+
0: "overwrite": true,
|
401 |
+
0: "fast": true,
|
402 |
+
0: "start_profile_step": 3,
|
403 |
+
0: "end_profile_step": 5,
|
404 |
+
0: "tuner_type": "gridsearch",
|
405 |
+
0: "tuner_early_stopping": 5,
|
406 |
+
0: "tuner_num_trials": 50,
|
407 |
+
0: "model_info_path": null,
|
408 |
+
0: "mp_size": 1,
|
409 |
+
0: "max_train_batch_size": null,
|
410 |
+
0: "min_train_batch_size": 1,
|
411 |
+
0: "max_train_micro_batch_size_per_gpu": 1.024000e+03,
|
412 |
+
0: "min_train_micro_batch_size_per_gpu": 1,
|
413 |
+
0: "num_tuning_micro_batch_sizes": 3
|
414 |
+
0: }
|
415 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] bfloat16_enabled ............. True
|
416 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False
|
417 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True
|
418 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False
|
419 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x14b88d151d60>
|
420 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] communication_data_type ...... None
|
421 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa
|
422 |
+
0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
|
423 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] curriculum_enabled ........... False
|
424 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] curriculum_params ............ False
|
425 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dataloader_drop_last ......... False
|
426 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] disable_allgather ............ False
|
427 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dump_state ................... False
|
428 |
+
0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None
|
429 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False
|
430 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1
|
431 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer
|
432 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0
|
433 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100
|
434 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06
|
435 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01
|
436 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False
|
437 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] elasticity_enabled ........... False
|
438 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] flops_profiler_config ........ {
|
439 |
+
0: "enabled": false,
|
440 |
+
0: "profile_step": 1,
|
441 |
+
0: "module_depth": -1,
|
442 |
+
0: "top_modules": 1,
|
443 |
+
0: "detailed": true,
|
444 |
+
0: "output_file": null
|
445 |
+
0: }
|
446 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_auto_cast ............... None
|
447 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_enabled ................. False
|
448 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False
|
449 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] global_rank .................. 0
|
450 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1
|
451 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0
|
452 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0
|
453 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1
|
454 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] load_universal_checkpoint .... False
|
455 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] loss_scale ................... 1.0
|
456 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] memory_breakdown ............. False
|
457 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] monitor_config ............... <deepspeed.monitor.config.DeepSpeedMonitorConfig object at 0x14b88d151ca0>
|
458 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] nebula_config ................ {
|
459 |
+
0: "enabled": false,
|
460 |
+
0: "persistent_storage_path": null,
|
461 |
+
0: "persistent_time_interval": 100,
|
462 |
+
0: "num_of_version_in_retention": 2,
|
463 |
+
0: "enable_nebula_load": true,
|
464 |
+
0: "load_path": null
|
465 |
+
0: }
|
466 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False
|
467 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_name ............... None
|
468 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_params ............. None
|
469 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
|
470 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pld_enabled .................. False
|
471 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pld_params ................... False
|
472 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] prescale_gradients ........... False
|
473 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] scheduler_name ............... None
|
474 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] scheduler_params ............. None
|
475 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] sparse_attention ............. None
|
476 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False
|
477 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] steps_per_print .............. 2000
|
478 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] train_batch_size ............. 256
|
479 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 32
|
480 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] use_node_local_storage ....... False
|
481 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False
|
482 |
+
0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] world_size ................... 8
|
483 |
+
0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False
|
484 |
+
0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False
|
485 |
+
0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_enabled ................. False
|
486 |
+
0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0
|
487 |
+
0: [2023-05-10 00:14:10,742] [INFO] [config.py:996:print_user_config] json = {
|
488 |
+
0: "train_micro_batch_size_per_gpu": 32,
|
489 |
+
0: "train_batch_size": 256,
|
490 |
+
0: "gradient_clipping": 1.0,
|
491 |
+
0: "zero_optimization": {
|
492 |
+
0: "stage": 0
|
493 |
+
0: },
|
494 |
+
0: "bf16": {
|
495 |
+
0: "enabled": true
|
496 |
+
0: },
|
497 |
+
0: "steps_per_print": 2.000000e+03,
|
498 |
+
0: "wall_clock_breakdown": false
|
499 |
+
0: }
|
500 |
+
0: Time to load utils op: 0.00042510032653808594 seconds
|
501 |
+
0: [2023-05-10 00:14:10,742] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=32
|
502 |
+
0: [2023-05-10 00:14:10,754] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=12 [0, 12) STAGE_PARAMS=19703712 (19.704M) TOTAL_PARAMS=19703712 (19.704M) UNIQUE_PARAMS=19703712 (19.704M)
|
503 |
+
0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
504 |
+
0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
505 |
+
0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
506 |
+
0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
507 |
+
0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
508 |
+
0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
509 |
+
0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
510 |
+
0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
511 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
512 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
513 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
514 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
515 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
516 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
517 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
518 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
519 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
520 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
521 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
522 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
523 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
524 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
525 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
526 |
+
0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
|
527 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
528 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
529 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
530 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
531 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
532 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
533 |
+
0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
534 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
535 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
|
536 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
537 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
538 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
539 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
540 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
541 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
542 |
+
0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
543 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
544 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
545 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
546 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
547 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
548 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
549 |
+
0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
550 |
+
0: [2023-05-10 00:14:11,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
551 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
552 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
553 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
554 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
555 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
556 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
557 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
558 |
+
0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
|
559 |
+
0: [2023-05-10 00:14:11,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
560 |
+
0: [2023-05-10 00:14:11,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
561 |
+
0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
562 |
+
0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
563 |
+
0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
564 |
+
0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
565 |
+
0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
566 |
+
0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
|
567 |
+
0: [2023-05-10 00:14:11,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
568 |
+
0: [2023-05-10 00:14:11,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
569 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
570 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
571 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
572 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
573 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
574 |
+
0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
575 |
+
0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
576 |
+
0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
577 |
+
0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
578 |
+
0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
579 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
580 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
581 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
582 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
583 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
584 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
585 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
586 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
587 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
588 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
589 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
590 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
591 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
|
592 |
+
0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
593 |
+
0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
594 |
+
0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
595 |
+
0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
596 |
+
0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
597 |
+
0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
598 |
+
0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
599 |
+
0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
600 |
+
0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
|
601 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
602 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
603 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
604 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
605 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
606 |
+
0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
607 |
+
0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
608 |
+
0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
609 |
+
0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
610 |
+
0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
611 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
612 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
613 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
614 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
615 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
616 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
617 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
618 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
619 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
620 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
621 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
622 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
623 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
|
624 |
+
0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
625 |
+
0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
626 |
+
0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
627 |
+
0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
628 |
+
0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
629 |
+
0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
630 |
+
0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
631 |
+
0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
632 |
+
0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
|
633 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
634 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
635 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
636 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
637 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
638 |
+
0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
639 |
+
0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
640 |
+
0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
641 |
+
0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
642 |
+
0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
643 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
644 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
645 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
646 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
647 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
648 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
649 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
650 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
651 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
652 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
653 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
654 |
+
0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
|
655 |
+
0: [2023-05-10 00:14:11,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
656 |
+
0: [2023-05-10 00:14:11,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
657 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
658 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
659 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
660 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
661 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
662 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
663 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
664 |
+
0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
|
665 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
666 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
667 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
668 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
669 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
670 |
+
0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
671 |
+
0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
672 |
+
0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
673 |
+
0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
674 |
+
0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
675 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
676 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
677 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
678 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
679 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
680 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
681 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
682 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
683 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
684 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
685 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
686 |
+
0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
|
687 |
+
0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
688 |
+
0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
689 |
+
0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
690 |
+
0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
691 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
692 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
693 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
694 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
695 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
696 |
+
0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
|
697 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
698 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
699 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
700 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
701 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
702 |
+
0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
703 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
704 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
705 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
706 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
707 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
708 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
709 |
+
0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
710 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
711 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
712 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
713 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
714 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
715 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
716 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
717 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
718 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
|
719 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
720 |
+
0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
721 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
722 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
723 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
724 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
725 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
726 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
|
727 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
728 |
+
0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
729 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
730 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
731 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
732 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
733 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
734 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
735 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
736 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
737 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
738 |
+
0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
739 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
740 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
741 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
742 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
743 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
744 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
745 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
746 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
747 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
748 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
749 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
750 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
751 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
752 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
|
753 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
754 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
755 |
+
0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
756 |
+
0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
757 |
+
0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
758 |
+
0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
|
759 |
+
0: > overriding learning rate value to 0.0002
|
760 |
+
0: > overriding minimum learning rate value to 2e-05
|
761 |
+
0: > overriding warmup iterations value to 0
|
762 |
+
0: > overriding total number of iterations value to 1
|
763 |
+
0: > overriding decay style value to cosine
|
764 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
|
765 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
766 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
|
767 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
|
768 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
|
769 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
|
770 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
|
771 |
+
0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
|
772 |
+
0: [2023-05-10 00:14:11,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
|
773 |
+
0: [2023-05-10 00:14:11,363] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 4
|
774 |
+
0: [2023-05-10 00:14:11,365] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 4
|
775 |
+
0: [2023-05-10 00:14:11,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
|
776 |
+
0: [2023-05-10 00:14:11,376] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 3
|
777 |
+
0: [2023-05-10 00:14:11,378] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 3
|
778 |
+
0: [2023-05-10 00:14:11,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
779 |
+
0: [2023-05-10 00:14:11,387] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0
|
780 |
+
0: [2023-05-10 00:14:11,390] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0
|
781 |
+
0: could not find arguments in the checkpoint ...
|
782 |
+
0: checkpoint version 3.0
|
783 |
+
0: [2023-05-10 00:14:11,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
|
784 |
+
0: [2023-05-10 00:14:11,395] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 5
|
785 |
+
0: [2023-05-10 00:14:11,397] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 5
|
786 |
+
0: [2023-05-10 00:14:11,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
|
787 |
+
0: [2023-05-10 00:14:11,398] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 2
|
788 |
+
0: [2023-05-10 00:14:11,400] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 2
|
789 |
+
0: [2023-05-10 00:14:11,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
|
790 |
+
0: [2023-05-10 00:14:11,409] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 1
|
791 |
+
0: [2023-05-10 00:14:11,411] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 1
|
792 |
+
0: [2023-05-10 00:14:11,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
|
793 |
+
0: [2023-05-10 00:14:11,444] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 7
|
794 |
+
0: [2023-05-10 00:14:11,445] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 7
|
795 |
+
0: [2023-05-10 00:14:11,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
|
796 |
+
0: [2023-05-10 00:14:11,454] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 6
|
797 |
+
0: [2023-05-10 00:14:11,455] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 6
|
798 |
+
0: successfully loaded checkpoint from checkpoints_21m400m400m at iteration 0
|
799 |
+
0: time (ms) | load-checkpoint: 701.61
|
800 |
+
0: estimated model parameters: 0.019703712
|
801 |
+
0: estimated model parameters without embeddings: 0.004626336
|
802 |
+
0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-05-10 00:14:12
|
803 |
+
0: > building train, validation, and test datasets ...
|
804 |
+
0: > datasets target sizes (minimum size):
|
805 |
+
0: train: 1
|
806 |
+
0: validation: 25600
|
807 |
+
0: test: 25600
|
808 |
+
0: > building train, validation, and test datasets for GPT ...
|
809 |
+
0: > building dataset index ...
|
810 |
+
0: reading sizes...
|
811 |
+
0: reading pointers...
|
812 |
+
0: reading document index...
|
813 |
+
0: creating numpy buffer of mmap...
|
814 |
+
0: creating memory view of numpy buffer...
|
815 |
+
0: > finished creating indexed dataset in 0.036932 seconds
|
816 |
+
0: number of documents: 835726
|
817 |
+
0: > dataset split:
|
818 |
+
0: train:
|
819 |
+
0: document indices in [0, 835726) total of 835726 documents
|
820 |
+
0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy
|
821 |
+
0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy
|
822 |
+
0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy
|
823 |
+
0: loaded indexed file in 0.113 seconds
|
824 |
+
0: total number of samples: 195101
|
825 |
+
0: total number of epochs: 1
|
826 |
+
0: > building dataset index ...
|
827 |
+
0: reading sizes...
|
828 |
+
0: reading pointers...
|
829 |
+
0: reading document index...
|
830 |
+
0: creating numpy buffer of mmap...
|
831 |
+
0: creating memory view of numpy buffer...
|
832 |
+
0: > finished creating indexed dataset in 0.110965 seconds
|
833 |
+
0: number of documents: 364608
|
834 |
+
0: > dataset split:
|
835 |
+
0: validation:
|
836 |
+
0: document indices in [0, 364608) total of 364608 documents
|
837 |
+
0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy
|
838 |
+
0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy
|
839 |
+
0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy
|
840 |
+
0: loaded indexed file in 0.096 seconds
|
841 |
+
0: total number of samples: 84978
|
842 |
+
0: total number of epochs: 1
|
843 |
+
0: > finished creating GPT datasets ...
|
844 |
+
0: time (ms) | model-and-optimizer-setup: 9807.69 | train/valid/test-data-iterators-setup: 7326.47
|
845 |
+
0: [after dataloaders are built] datetime: 2023-05-10 00:14:19
|
846 |
+
0: done with setup ...
|
847 |
+
0: training ...
|
848 |
+
0: [after training is done] datetime: 2023-05-10 00:14:19
|
849 |
+
0: -----------------------------------------------------------------------------------------------------------------
|
850 |
+
0: validation loss at the end of training for val data | lm loss value: 6.096268E+00 | lm loss PPL: 4.441970E+02 |
|
851 |
+
0: -----------------------------------------------------------------------------------------------------------------
|
852 |
+
END 3487337: Wed 10 May 2023 12:15:00 AM EEST
|
21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac95f09abe87dd9c68b9b4a9829080de26ac03ebe49e9fab6898e669e10204ca
|
3 |
+
size 29560343
|
21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efb818b085f236a1a1f791c2f1a795db756a52a66ab08d0286bf5b8006b76794
|
3 |
+
size 29560215
|
21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ef244aa36694f385dc28084076b72043fa3814eca2d07cf51b033500ef3989e
|
3 |
+
size 29560343
|
21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1317c9587f00bee62633072cbf5394ec9ee83838a437729fb54847887465bfee
|
3 |
+
size 29560343
|
21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6913ca4a0ae0e6991b668a9c7a92acc87ea502c5a5c096c07d8195a5cd06f69b
|
3 |
+
size 29560279
|
21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ef2876867f1b52a3d6362517c9d58f2b70641956bff4a03cea9b7a09fabb83c
|
3 |
+
size 29560343
|
21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47ed8dd698a7b82205d723acc34b2f90ff96e074b123b252f2db5bd902c18174
|
3 |
+
size 29560471
|
21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f5f4b3628d526b6ca81f845847cc6c26fafc52052188540565a0478e03ebf0a
|
3 |
+
size 29560727
|
21m400m400m/global_step762/layer_01-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88174b136681638ce4da8cddad57423ca02d50894fedede8a82b5ed6b21df9a5
|
3 |
+
size 30156035
|
21m400m400m/global_step762/layer_03-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:da290e4e9bdc0b0bf908d9233566700072a2485174cf204ca66121dc8e16afe1
|
3 |
+
size 1854659
|
21m400m400m/global_step762/layer_04-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:248612036a8e9e67d8a9b5fd1ee266df4e0bc31afc9b7162f40554f7ebd5ff96
|
3 |
+
size 1854659
|
21m400m400m/global_step762/layer_05-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c82a70fc53941272564fca38b712423a55b937d3b1395524d24a8e0b42cf2790
|
3 |
+
size 1854659
|
21m400m400m/global_step762/layer_06-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa99a8168c79f748cd8639466fd29bbb78f68c770fbf4b24763f6d72d2345b91
|
3 |
+
size 1854659
|
21m400m400m/global_step762/layer_07-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3fb0d4bf06d9d68253b1b5556324c495446c93f1237ad79195f56ff4b1cf4ecf
|
3 |
+
size 1854659
|
21m400m400m/global_step762/layer_09-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e2670512cabbec37a36b100140fc6e12d6f75b5c85108026ccf5971554ee687f
|
3 |
+
size 2371
|
21m400m400m/global_step762/mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9a98d0d201d95a08d421020b41ee795e8f824d673cf1d88e81bdc3733007d47
|
3 |
+
size 27827
|
21m400m400m/sbatch_21m400m400m.sh
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
|
3 |
+
#SBATCH --nodes=1
|
4 |
+
#SBATCH --ntasks-per-node=1
|
5 |
+
#SBATCH --cpus-per-task=32
|
6 |
+
#SBATCH --mem=256G
|
7 |
+
#SBATCH -p standard-g
|
8 |
+
#SBATCH -t 2-0:00:00
|
9 |
+
#SBATCH --gpus-per-node=mi250:8
|
10 |
+
#SBATCH --exclusive=user
|
11 |
+
#SBATCH --hint=nomultithread
|
12 |
+
#SBATCH --account=project_462000119
|
13 |
+
#SBATCH -o logs/%j.out
|
14 |
+
#SBATCH -e logs/%j.err
|
15 |
+
|
16 |
+
VARIANT=21m400m400m
|
17 |
+
|
18 |
+
# if run without sbatch, invoke here
|
19 |
+
if [ -z $SLURM_JOB_ID ]; then
|
20 |
+
mkdir -p logs
|
21 |
+
sbatch "$0"
|
22 |
+
exit
|
23 |
+
fi
|
24 |
+
|
25 |
+
set -euo pipefail
|
26 |
+
|
27 |
+
# symlink logs/latest.out and logs/latest.err
|
28 |
+
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
29 |
+
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
30 |
+
|
31 |
+
KILL_SWITCH_PATH=kill-switch-$VARIANT
|
32 |
+
CHECKPOINT_PATH=checkpoints_$VARIANT
|
33 |
+
TENSORBOARD_PATH=tensorboard_$VARIANT
|
34 |
+
mkdir -p $CHECKPOINT_PATH
|
35 |
+
mkdir -p $TENSORBOARD_PATH
|
36 |
+
|
37 |
+
# Data
|
38 |
+
VOCAB_FILE="gpt2/vocab.json"
|
39 |
+
MERGE_FILE="gpt2/merges.txt"
|
40 |
+
#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
|
41 |
+
TRAIN_DATA_PATH=train400m.txt
|
42 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document"
|
43 |
+
VALID_DATA_PATH=val.txt
|
44 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
45 |
+
|
46 |
+
|
47 |
+
PP_SIZE=1
|
48 |
+
TP_SIZE=1
|
49 |
+
|
50 |
+
MICRO_BATCH_SIZE=32
|
51 |
+
GRADIENT_ACCUMULATION_STEPS=1
|
52 |
+
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
|
53 |
+
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
|
54 |
+
|
55 |
+
# Model parameters
|
56 |
+
source model_params.sh
|
57 |
+
MODEL_PARAM=("${PARAM_21M[@]}")
|
58 |
+
NHIDDEN=${MODEL_PARAM[0]}
|
59 |
+
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
|
60 |
+
KV_SIZE=${MODEL_PARAM[2]}
|
61 |
+
NHEADS=${MODEL_PARAM[3]}
|
62 |
+
NLAYERS=${MODEL_PARAM[4]}
|
63 |
+
SEQ_LEN=2048
|
64 |
+
|
65 |
+
echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
|
66 |
+
|
67 |
+
SAVE_INTERVAL=10000
|
68 |
+
|
69 |
+
# Tokens: 100 000 000
|
70 |
+
# -> Samples: 48828.125
|
71 |
+
#TRAIN_SAMPLES=48_828
|
72 |
+
# Tokens: 400M
|
73 |
+
# -> Samples: 195312.5
|
74 |
+
TRAIN_SAMPLES=195_313
|
75 |
+
|
76 |
+
|
77 |
+
OPTIMIZER_ARGS=" \
|
78 |
+
--optimizer adam \
|
79 |
+
--adam-beta1 0.9 \
|
80 |
+
--adam-beta2 0.999 \
|
81 |
+
--adam-eps 1e-8 \
|
82 |
+
--lr 2e-4 \
|
83 |
+
--min-lr 2e-5 \
|
84 |
+
--lr-decay-style cosine \
|
85 |
+
--lr-decay-samples $TRAIN_SAMPLES \
|
86 |
+
--lr-warmup-samples 1953 \
|
87 |
+
--clip-grad 1.0 \
|
88 |
+
--weight-decay 1e-1 \
|
89 |
+
"
|
90 |
+
|
91 |
+
GPT_ARGS=" \
|
92 |
+
--num-layers $NLAYERS \
|
93 |
+
--hidden-size $NHIDDEN \
|
94 |
+
--num-attention-heads $NHEADS \
|
95 |
+
--kv-channels $KV_SIZE \
|
96 |
+
--ffn-hidden-size $FFN_HIDDEN_SIZE \
|
97 |
+
--seq-length $SEQ_LEN \
|
98 |
+
--max-position-embeddings $SEQ_LEN \
|
99 |
+
--micro-batch-size $MICRO_BATCH_SIZE \
|
100 |
+
--global-batch-size $GLOBAL_BATCH_SIZE \
|
101 |
+
--train-samples $TRAIN_SAMPLES \
|
102 |
+
--vocab-file $VOCAB_FILE \
|
103 |
+
--merge-file $MERGE_FILE \
|
104 |
+
--loss-scale 12 \
|
105 |
+
--clip-grad 1.0 \
|
106 |
+
--kill-switch-path $KILL_SWITCH_PATH \
|
107 |
+
--bf16 \
|
108 |
+
--checkpoint-activations \
|
109 |
+
$OPTIMIZER_ARGS \
|
110 |
+
"
|
111 |
+
|
112 |
+
OUTPUT_ARGS=" \
|
113 |
+
--log-interval 10 \
|
114 |
+
--save-interval $SAVE_INTERVAL \
|
115 |
+
--eval-interval 1000 \
|
116 |
+
--eval-iters 1 \
|
117 |
+
--tensorboard-dir $TENSORBOARD_PATH \
|
118 |
+
--tensorboard-queue-size 5 \
|
119 |
+
--log-timers-to-tensorboard \
|
120 |
+
--log-batch-size-to-tensorboard \
|
121 |
+
--log-validation-ppl-to-tensorboard \
|
122 |
+
"
|
123 |
+
|
124 |
+
ZERO_STAGE=0
|
125 |
+
|
126 |
+
mkdir -p ds_configs
|
127 |
+
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
|
128 |
+
|
129 |
+
cat <<EOF > $DS_CONFIG_PATH
|
130 |
+
{
|
131 |
+
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
|
132 |
+
"train_batch_size": $GLOBAL_BATCH_SIZE,
|
133 |
+
"gradient_clipping": 1.0,
|
134 |
+
"zero_optimization": {
|
135 |
+
"stage": $ZERO_STAGE
|
136 |
+
},
|
137 |
+
"bf16": {
|
138 |
+
"enabled": true
|
139 |
+
},
|
140 |
+
"steps_per_print": 2000,
|
141 |
+
"wall_clock_breakdown": false
|
142 |
+
}
|
143 |
+
EOF
|
144 |
+
|
145 |
+
DEEPSPEED_ARGS=" \
|
146 |
+
--deepspeed \
|
147 |
+
--deepspeed_config $DS_CONFIG_PATH \
|
148 |
+
--zero-stage $ZERO_STAGE \
|
149 |
+
"
|
150 |
+
|
151 |
+
CMD=" \
|
152 |
+
Megatron-DeepSpeed/pretrain_gpt.py \
|
153 |
+
--tensor-model-parallel-size $TP_SIZE \
|
154 |
+
--pipeline-model-parallel-size $PP_SIZE \
|
155 |
+
$GPT_ARGS \
|
156 |
+
$OUTPUT_ARGS \
|
157 |
+
--save $CHECKPOINT_PATH \
|
158 |
+
--load $CHECKPOINT_PATH \
|
159 |
+
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
|
160 |
+
--valid-weighted-split-paths-path $VALID_DATA_PATH \
|
161 |
+
--data-impl mmap \
|
162 |
+
$DEEPSPEED_ARGS \
|
163 |
+
"
|
164 |
+
|
165 |
+
echo $CMD
|
166 |
+
|
167 |
+
echo "START $SLURM_JOBID: $(date)"
|
168 |
+
|
169 |
+
# bash launch_srun.sh $CMD
|
170 |
+
srun --label launch.sh $CMD
|
171 |
+
|
172 |
+
echo "END $SLURM_JOBID: $(date)"
|
21m400m400m/sbatch_21m400m400mval.sh
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
|
3 |
+
#SBATCH --nodes=1
|
4 |
+
#SBATCH --ntasks-per-node=1
|
5 |
+
#SBATCH --cpus-per-task=32
|
6 |
+
#SBATCH --mem=256G
|
7 |
+
#SBATCH -p small-g
|
8 |
+
#SBATCH -t 12:00:00
|
9 |
+
#SBATCH --gpus-per-node=mi250:8
|
10 |
+
#SBATCH --exclusive=user
|
11 |
+
#SBATCH --hint=nomultithread
|
12 |
+
#SBATCH --account=project_462000119
|
13 |
+
#SBATCH -o logs/%j.out
|
14 |
+
#SBATCH -e logs/%j.err
|
15 |
+
|
16 |
+
VARIANT=21m400m400mval
|
17 |
+
VARIANT_CKPT=21m400m400m
|
18 |
+
|
19 |
+
# if run without sbatch, invoke here
|
20 |
+
if [ -z $SLURM_JOB_ID ]; then
|
21 |
+
mkdir -p logs
|
22 |
+
sbatch "$0"
|
23 |
+
exit
|
24 |
+
fi
|
25 |
+
|
26 |
+
set -euo pipefail
|
27 |
+
|
28 |
+
# symlink logs/latest.out and logs/latest.err
|
29 |
+
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
30 |
+
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
31 |
+
|
32 |
+
KILL_SWITCH_PATH=kill-switch-$VARIANT
|
33 |
+
CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT
|
34 |
+
TENSORBOARD_PATH=tensorboard_$VARIANT
|
35 |
+
|
36 |
+
# Data
|
37 |
+
VOCAB_FILE="gpt2/vocab.json"
|
38 |
+
MERGE_FILE="gpt2/merges.txt"
|
39 |
+
#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
|
40 |
+
TRAIN_DATA_PATH=train400m.txt
|
41 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document"
|
42 |
+
VALID_DATA_PATH=val.txt
|
43 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
44 |
+
|
45 |
+
PP_SIZE=1
|
46 |
+
TP_SIZE=1
|
47 |
+
|
48 |
+
MICRO_BATCH_SIZE=32
|
49 |
+
GRADIENT_ACCUMULATION_STEPS=1
|
50 |
+
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
|
51 |
+
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
|
52 |
+
|
53 |
+
# Model parameters
|
54 |
+
source model_params.sh
|
55 |
+
MODEL_PARAM=("${PARAM_20M[@]}")
|
56 |
+
NHIDDEN=${MODEL_PARAM[0]}
|
57 |
+
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
|
58 |
+
KV_SIZE=${MODEL_PARAM[2]}
|
59 |
+
NHEADS=${MODEL_PARAM[3]}
|
60 |
+
NLAYERS=${MODEL_PARAM[4]}
|
61 |
+
SEQ_LEN=2048
|
62 |
+
|
63 |
+
echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
|
64 |
+
|
65 |
+
SAVE_INTERVAL=1000
|
66 |
+
|
67 |
+
# Tokens: 31633480000
|
68 |
+
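# -> Samples: 15446035 (figures left over from a full training config; unused here, since TRAIN_SAMPLES below is 1 for this validation run)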
|
69 |
+
TRAIN_SAMPLES=1
|
70 |
+
|
71 |
+
OPTIMIZER_ARGS=" \
|
72 |
+
--optimizer adam \
|
73 |
+
--adam-beta1 0.9 \
|
74 |
+
--adam-beta2 0.999 \
|
75 |
+
--adam-eps 1e-8 \
|
76 |
+
--lr 2e-4 \
|
77 |
+
--min-lr 2e-5 \
|
78 |
+
--lr-decay-style cosine \
|
79 |
+
--lr-decay-samples $TRAIN_SAMPLES \
|
80 |
+
--lr-warmup-samples 0 \
|
81 |
+
--clip-grad 1.0 \
|
82 |
+
--weight-decay 1e-1 \
|
83 |
+
--override-lr-scheduler \
|
84 |
+
--no-load-optim \
|
85 |
+
--reset-progress \
|
86 |
+
"
|
87 |
+
|
88 |
+
GPT_ARGS=" \
|
89 |
+
--num-layers $NLAYERS \
|
90 |
+
--hidden-size $NHIDDEN \
|
91 |
+
--num-attention-heads $NHEADS \
|
92 |
+
--kv-channels $KV_SIZE \
|
93 |
+
--ffn-hidden-size $FFN_HIDDEN_SIZE \
|
94 |
+
--seq-length $SEQ_LEN \
|
95 |
+
--max-position-embeddings $SEQ_LEN \
|
96 |
+
--micro-batch-size $MICRO_BATCH_SIZE \
|
97 |
+
--global-batch-size $GLOBAL_BATCH_SIZE \
|
98 |
+
--train-samples $TRAIN_SAMPLES \
|
99 |
+
--vocab-file $VOCAB_FILE \
|
100 |
+
--merge-file $MERGE_FILE \
|
101 |
+
--clip-grad 1.0 \
|
102 |
+
--kill-switch-path $KILL_SWITCH_PATH \
|
103 |
+
--bf16 \
|
104 |
+
$OPTIMIZER_ARGS \
|
105 |
+
"
|
106 |
+
|
107 |
+
OUTPUT_ARGS=" \
|
108 |
+
--log-interval 10 \
|
109 |
+
--save-interval $SAVE_INTERVAL \
|
110 |
+
--eval-interval 1 \
|
111 |
+
--eval-iters 100 \
|
112 |
+
--tensorboard-dir $TENSORBOARD_PATH \
|
113 |
+
--tensorboard-queue-size 5 \
|
114 |
+
--log-timers-to-tensorboard \
|
115 |
+
--log-batch-size-to-tensorboard \
|
116 |
+
--log-validation-ppl-to-tensorboard \
|
117 |
+
"
|
118 |
+
|
119 |
+
ZERO_STAGE=0
|
120 |
+
|
121 |
+
mkdir -p ds_configs
|
122 |
+
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
|
123 |
+
|
124 |
+
cat <<EOF > $DS_CONFIG_PATH
|
125 |
+
{
|
126 |
+
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
|
127 |
+
"train_batch_size": $GLOBAL_BATCH_SIZE,
|
128 |
+
"gradient_clipping": 1.0,
|
129 |
+
"zero_optimization": {
|
130 |
+
"stage": $ZERO_STAGE
|
131 |
+
},
|
132 |
+
"bf16": {
|
133 |
+
"enabled": true
|
134 |
+
},
|
135 |
+
"steps_per_print": 2000,
|
136 |
+
"wall_clock_breakdown": false
|
137 |
+
}
|
138 |
+
EOF
|
139 |
+
|
140 |
+
DEEPSPEED_ARGS=" \
|
141 |
+
--deepspeed \
|
142 |
+
--deepspeed_config $DS_CONFIG_PATH \
|
143 |
+
--zero-stage $ZERO_STAGE \
|
144 |
+
"
|
145 |
+
|
146 |
+
CMD=" \
|
147 |
+
Megatron-DeepSpeed/pretrain_gpt.py \
|
148 |
+
--tensor-model-parallel-size $TP_SIZE \
|
149 |
+
--pipeline-model-parallel-size $PP_SIZE \
|
150 |
+
$GPT_ARGS \
|
151 |
+
$OUTPUT_ARGS \
|
152 |
+
--save $CHECKPOINT_PATH \
|
153 |
+
--load $CHECKPOINT_PATH \
|
154 |
+
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
|
155 |
+
--valid-weighted-split-paths-path $VALID_DATA_PATH \
|
156 |
+
--data-impl mmap \
|
157 |
+
$DEEPSPEED_ARGS \
|
158 |
+
"
|
159 |
+
|
160 |
+
echo $CMD
|
161 |
+
|
162 |
+
echo "START $SLURM_JOBID: $(date)"
|
163 |
+
|
164 |
+
# bash launch_srun.sh $CMD
|
165 |
+
srun --label launch.sh $CMD
|
166 |
+
|
167 |
+
echo "END $SLURM_JOBID: $(date)"
|
21m400m400m/tensorboard_21m400m400m/events.out.tfevents.1683665937.nid005223.63334.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e21f3a911a130ca44d1743a160abe95a79e77640e38e8542149a3c89501d6f03
|
3 |
+
size 1369058
|
21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666595.nid007269.50912.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65257e05e3a25f4bf5fa93218d0fc6e10e8040d0f4501e532d87436fb186a6e6
|
3 |
+
size 40
|
21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666830.nid007269.54799.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e683f0205510ce039d2a2d0816282b94c3cdfb5b71fdac91e5280090b1a264b
|
3 |
+
size 980
|
220m200b1b5/sbatch_220m200b1b5.sh
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --nodes=8
|
3 |
+
#SBATCH --ntasks-per-node=1
|
4 |
+
#SBATCH --cpus-per-task=32
|
5 |
+
#SBATCH --mem=256G
|
6 |
+
#SBATCH -p standard-g
|
7 |
+
#SBATCH -t 48:00:00
|
8 |
+
#SBATCH --gpus-per-node=mi250:8
|
9 |
+
#SBATCH --exclusive=user
|
10 |
+
#SBATCH --hint=nomultithread
|
11 |
+
#SBATCH --account=project_462000119
|
12 |
+
#SBATCH -o logs/%j.out
|
13 |
+
#SBATCH -e logs/%j.err
|
14 |
+
|
15 |
+
VARIANT=220m200b1b5
|
16 |
+
|
17 |
+
# If this script is executed directly (not via sbatch), submit it to SLURM with sbatch and exit
|
18 |
+
if [ -z $SLURM_JOB_ID ]; then
|
19 |
+
mkdir -p logs
|
20 |
+
sbatch "$0"
|
21 |
+
exit
|
22 |
+
fi
|
23 |
+
|
24 |
+
set -euo pipefail
|
25 |
+
|
26 |
+
# symlink logs/latest.out and logs/latest.err
|
27 |
+
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
28 |
+
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
29 |
+
|
30 |
+
KILL_SWITCH_PATH=kill-switch-$VARIANT
|
31 |
+
CHECKPOINT_PATH=checkpoints_$VARIANT
|
32 |
+
TENSORBOARD_PATH=tensorboard_$VARIANT
|
33 |
+
# Start from scratch
|
34 |
+
# rm -rf "$CHECKPOINT_PATH" "$TENSORBOARD_PATH"
|
35 |
+
|
36 |
+
# Data
|
37 |
+
VOCAB_FILE="gpt2/vocab.json"
|
38 |
+
MERGE_FILE="gpt2/merges.txt"
|
39 |
+
# DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
|
40 |
+
TRAIN_DATA_PATH=train1b5.txt
|
41 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document"
|
42 |
+
VALID_DATA_PATH=val.txt
|
43 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
44 |
+
|
45 |
+
PP_SIZE=1
|
46 |
+
TP_SIZE=1
|
47 |
+
|
48 |
+
MICRO_BATCH_SIZE=4
|
49 |
+
GRADIENT_ACCUMULATION_STEPS=1
|
50 |
+
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
|
51 |
+
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
|
52 |
+
|
53 |
+
# Model parameters
|
54 |
+
source model_params.sh
|
55 |
+
MODEL_PARAM=("${PARAM_217M[@]}")
|
56 |
+
NHIDDEN=${MODEL_PARAM[0]}
|
57 |
+
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
|
58 |
+
KV_SIZE=${MODEL_PARAM[2]}
|
59 |
+
NHEADS=${MODEL_PARAM[3]}
|
60 |
+
NLAYERS=${MODEL_PARAM[4]}
|
61 |
+
SEQ_LEN=2048
|
62 |
+
|
63 |
+
echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
|
64 |
+
|
65 |
+
SAVE_INTERVAL=20000
|
66 |
+
|
67 |
+
# Tokens: 7510000000
|
68 |
+
# -> Samples: 3_666_992
|
69 |
+
TRAIN_SAMPLES=3_666_992
|
70 |
+
# Tokens: 200e9
|
71 |
+
TRAIN_SAMPLES=97_656_250
|
72 |
+
|
73 |
+
OPTIMIZER_ARGS=" \
|
74 |
+
--optimizer adam \
|
75 |
+
--adam-beta1 0.9 \
|
76 |
+
--adam-beta2 0.999 \
|
77 |
+
--adam-eps 1e-8 \
|
78 |
+
--lr 2e-4 \
|
79 |
+
--min-lr 2e-5 \
|
80 |
+
--lr-decay-style cosine \
|
81 |
+
--lr-decay-samples $TRAIN_SAMPLES \
|
82 |
+
--lr-warmup-samples 976_563 \
|
83 |
+
--clip-grad 1.0 \
|
84 |
+
--weight-decay 1e-1 \
|
85 |
+
"
|
86 |
+
|
87 |
+
GPT_ARGS=" \
|
88 |
+
--num-layers $NLAYERS \
|
89 |
+
--hidden-size $NHIDDEN \
|
90 |
+
--num-attention-heads $NHEADS \
|
91 |
+
--kv-channels $KV_SIZE \
|
92 |
+
--ffn-hidden-size $FFN_HIDDEN_SIZE \
|
93 |
+
--seq-length $SEQ_LEN \
|
94 |
+
--max-position-embeddings $SEQ_LEN \
|
95 |
+
--micro-batch-size $MICRO_BATCH_SIZE \
|
96 |
+
--global-batch-size $GLOBAL_BATCH_SIZE \
|
97 |
+
--train-samples $TRAIN_SAMPLES \
|
98 |
+
--vocab-file $VOCAB_FILE \
|
99 |
+
--merge-file $MERGE_FILE \
|
100 |
+
--clip-grad 1.0 \
|
101 |
+
--kill-switch-path $KILL_SWITCH_PATH \
|
102 |
+
--bf16 \
|
103 |
+
$OPTIMIZER_ARGS \
|
104 |
+
"
|
105 |
+
|
106 |
+
OUTPUT_ARGS=" \
|
107 |
+
--log-interval 500 \
|
108 |
+
--save-interval $SAVE_INTERVAL \
|
109 |
+
--eval-interval 50000 \
|
110 |
+
--eval-iters 1 \
|
111 |
+
--tensorboard-dir $TENSORBOARD_PATH \
|
112 |
+
--tensorboard-queue-size 5 \
|
113 |
+
--log-timers-to-tensorboard \
|
114 |
+
--log-batch-size-to-tensorboard \
|
115 |
+
--log-validation-ppl-to-tensorboard \
|
116 |
+
"
|
117 |
+
|
118 |
+
ZERO_STAGE=0
|
119 |
+
|
120 |
+
mkdir -p ds_configs
|
121 |
+
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
|
122 |
+
|
123 |
+
cat <<EOF > $DS_CONFIG_PATH
|
124 |
+
{
|
125 |
+
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
|
126 |
+
"train_batch_size": $GLOBAL_BATCH_SIZE,
|
127 |
+
"gradient_clipping": 1.0,
|
128 |
+
"zero_optimization": {
|
129 |
+
"stage": $ZERO_STAGE
|
130 |
+
},
|
131 |
+
"bf16": {
|
132 |
+
"enabled": true
|
133 |
+
},
|
134 |
+
"steps_per_print": 2000,
|
135 |
+
"wall_clock_breakdown": false
|
136 |
+
}
|
137 |
+
EOF
|
138 |
+
|
139 |
+
DEEPSPEED_ARGS=" \
|
140 |
+
--deepspeed \
|
141 |
+
--deepspeed_config $DS_CONFIG_PATH \
|
142 |
+
--zero-stage $ZERO_STAGE \
|
143 |
+
"
|
144 |
+
|
145 |
+
CMD=" \
|
146 |
+
Megatron-DeepSpeed2/pretrain_gpt.py \
|
147 |
+
--tensor-model-parallel-size $TP_SIZE \
|
148 |
+
--pipeline-model-parallel-size $PP_SIZE \
|
149 |
+
$GPT_ARGS \
|
150 |
+
$OUTPUT_ARGS \
|
151 |
+
--save $CHECKPOINT_PATH \
|
152 |
+
--load $CHECKPOINT_PATH \
|
153 |
+
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
|
154 |
+
--valid-weighted-split-paths-path $VALID_DATA_PATH \
|
155 |
+
--data-impl mmap \
|
156 |
+
$DEEPSPEED_ARGS \
|
157 |
+
"
|
158 |
+
|
159 |
+
echo $CMD
|
160 |
+
|
161 |
+
echo "START $SLURM_JOBID: $(date)"
|
162 |
+
|
163 |
+
# bash launch_srun.sh $CMD
|
164 |
+
srun --label launch.sh $CMD
|
165 |
+
|
166 |
+
echo "END $SLURM_JOBID: $(date)"
|
220m200b1b5/sbatch_220m200b1b5val.sh
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --nodes=8
|
3 |
+
#SBATCH --ntasks-per-node=1
|
4 |
+
#SBATCH --cpus-per-task=32
|
5 |
+
#SBATCH --mem=256G
|
6 |
+
#SBATCH -p standard-g
|
7 |
+
#SBATCH -t 48:00:00
|
8 |
+
#SBATCH --gpus-per-node=mi250:8
|
9 |
+
#SBATCH --exclusive=user
|
10 |
+
#SBATCH --hint=nomultithread
|
11 |
+
#SBATCH --account=project_462000119
|
12 |
+
#SBATCH -o logs/%j.out
|
13 |
+
#SBATCH -e logs/%j.err
|
14 |
+
|
15 |
+
VARIANT=220m200b1b5val
|
16 |
+
VARIANT_CKPT=220m200b1b5
|
17 |
+
|
18 |
+
# If this script is executed directly (not via sbatch), submit it to SLURM with sbatch and exit
|
19 |
+
if [ -z $SLURM_JOB_ID ]; then
|
20 |
+
mkdir -p logs
|
21 |
+
sbatch "$0"
|
22 |
+
exit
|
23 |
+
fi
|
24 |
+
|
25 |
+
set -euo pipefail
|
26 |
+
|
27 |
+
# symlink logs/latest.out and logs/latest.err
|
28 |
+
ln -f -s $SLURM_JOB_ID.out logs/latest.out
|
29 |
+
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
30 |
+
|
31 |
+
KILL_SWITCH_PATH=kill-switch-$VARIANT
|
32 |
+
CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT
|
33 |
+
TENSORBOARD_PATH=tensorboard_$VARIANT
|
34 |
+
|
35 |
+
# Data
|
36 |
+
VOCAB_FILE="gpt2/vocab.json"
|
37 |
+
MERGE_FILE="gpt2/merges.txt"
|
38 |
+
# DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
|
39 |
+
TRAIN_DATA_PATH=train1b5.txt
|
40 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document"
|
41 |
+
VALID_DATA_PATH=val.txt
|
42 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
43 |
+
|
44 |
+
PP_SIZE=1
|
45 |
+
TP_SIZE=1
|
46 |
+
|
47 |
+
MICRO_BATCH_SIZE=4
|
48 |
+
GRADIENT_ACCUMULATION_STEPS=1
|
49 |
+
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
|
50 |
+
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
|
51 |
+
|
52 |
+
# Model parameters
|
53 |
+
source model_params.sh
|
54 |
+
MODEL_PARAM=("${PARAM_217M[@]}")
|
55 |
+
NHIDDEN=${MODEL_PARAM[0]}
|
56 |
+
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
|
57 |
+
KV_SIZE=${MODEL_PARAM[2]}
|
58 |
+
NHEADS=${MODEL_PARAM[3]}
|
59 |
+
NLAYERS=${MODEL_PARAM[4]}
|
60 |
+
SEQ_LEN=2048
|
61 |
+
|
62 |
+
echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
|
63 |
+
|
64 |
+
SAVE_INTERVAL=1000
|
65 |
+
|
66 |
+
# Tokens: 7510000000
|
67 |
+
# -> Samples: 3_666_992 (carried over from the training script; this eval-only run sets TRAIN_SAMPLES=1 below)
|
68 |
+
TRAIN_SAMPLES=1
|
69 |
+
|
70 |
+
OPTIMIZER_ARGS=" \
|
71 |
+
--optimizer adam \
|
72 |
+
--adam-beta1 0.9 \
|
73 |
+
--adam-beta2 0.999 \
|
74 |
+
--adam-eps 1e-8 \
|
75 |
+
--lr 2e-4 \
|
76 |
+
--min-lr 2e-5 \
|
77 |
+
--lr-decay-style cosine \
|
78 |
+
--lr-decay-samples $TRAIN_SAMPLES \
|
79 |
+
--lr-warmup-samples 0 \
|
80 |
+
--clip-grad 1.0 \
|
81 |
+
--weight-decay 1e-1 \
|
82 |
+
--no-load-optim \
|
83 |
+
--reset-progress \
|
84 |
+
--override-lr-scheduler \
|
85 |
+
"
|
86 |
+
|
87 |
+
GPT_ARGS=" \
|
88 |
+
--num-layers $NLAYERS \
|
89 |
+
--hidden-size $NHIDDEN \
|
90 |
+
--num-attention-heads $NHEADS \
|
91 |
+
--kv-channels $KV_SIZE \
|
92 |
+
--ffn-hidden-size $FFN_HIDDEN_SIZE \
|
93 |
+
--seq-length $SEQ_LEN \
|
94 |
+
--max-position-embeddings $SEQ_LEN \
|
95 |
+
--micro-batch-size $MICRO_BATCH_SIZE \
|
96 |
+
--global-batch-size $GLOBAL_BATCH_SIZE \
|
97 |
+
--train-samples $TRAIN_SAMPLES \
|
98 |
+
--vocab-file $VOCAB_FILE \
|
99 |
+
--merge-file $MERGE_FILE \
|
100 |
+
--clip-grad 1.0 \
|
101 |
+
--kill-switch-path $KILL_SWITCH_PATH \
|
102 |
+
--bf16 \
|
103 |
+
$OPTIMIZER_ARGS \
|
104 |
+
"
|
105 |
+
|
106 |
+
OUTPUT_ARGS=" \
|
107 |
+
--log-interval 10 \
|
108 |
+
--save-interval $SAVE_INTERVAL \
|
109 |
+
--eval-interval 1 \
|
110 |
+
--eval-only true \
|
111 |
+
--eval-iters 100 \
|
112 |
+
--tensorboard-dir $TENSORBOARD_PATH \
|
113 |
+
--tensorboard-queue-size 5 \
|
114 |
+
--log-timers-to-tensorboard \
|
115 |
+
--log-batch-size-to-tensorboard \
|
116 |
+
--log-validation-ppl-to-tensorboard \
|
117 |
+
"
|
118 |
+
|
119 |
+
ZERO_STAGE=0
|
120 |
+
|
121 |
+
mkdir -p ds_configs
|
122 |
+
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
|
123 |
+
|
124 |
+
cat <<EOF > $DS_CONFIG_PATH
|
125 |
+
{
|
126 |
+
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
|
127 |
+
"train_batch_size": $GLOBAL_BATCH_SIZE,
|
128 |
+
"gradient_clipping": 1.0,
|
129 |
+
"zero_optimization": {
|
130 |
+
"stage": $ZERO_STAGE
|
131 |
+
},
|
132 |
+
"bf16": {
|
133 |
+
"enabled": true
|
134 |
+
},
|
135 |
+
"steps_per_print": 2000,
|
136 |
+
"wall_clock_breakdown": false
|
137 |
+
}
|
138 |
+
EOF
|
139 |
+
|
140 |
+
DEEPSPEED_ARGS=" \
|
141 |
+
--deepspeed \
|
142 |
+
--deepspeed_config $DS_CONFIG_PATH \
|
143 |
+
--zero-stage $ZERO_STAGE \
|
144 |
+
"
|
145 |
+
|
146 |
+
CMD=" \
|
147 |
+
Megatron-DeepSpeed/pretrain_gpt.py \
|
148 |
+
--tensor-model-parallel-size $TP_SIZE \
|
149 |
+
--pipeline-model-parallel-size $PP_SIZE \
|
150 |
+
$GPT_ARGS \
|
151 |
+
$OUTPUT_ARGS \
|
152 |
+
--save $CHECKPOINT_PATH \
|
153 |
+
--load $CHECKPOINT_PATH \
|
154 |
+
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
|
155 |
+
--valid-weighted-split-paths-path $VALID_DATA_PATH \
|
156 |
+
--data-impl mmap \
|
157 |
+
$DEEPSPEED_ARGS \
|
158 |
+
"
|
159 |
+
|
160 |
+
echo $CMD
|
161 |
+
|
162 |
+
echo "START $SLURM_JOBID: $(date)"
|
163 |
+
|
164 |
+
# bash launch_srun.sh $CMD
|
165 |
+
srun --label launch.sh $CMD
|
166 |
+
|
167 |
+
echo "END $SLURM_JOBID: $(date)"
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006529.96495.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d8a384f1edb99245e32c8e09b0b98d0e785f7fcc795b02ae7979e700b7bf876
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006860.1508.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5e7546168f06f9f3997d509636c095c97ad934478f199f0fb05746159f099a3
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679054214.nid005116.13183.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86485eaeafb6eadfdec07df2066d12246f64fd1e41a62d7889ad18b31b08e77d
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793013.nid006063.122512.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:54a286529cce0b3c873c9a9f87735d562748c34cdd700798b63d72f4c6cf4b04
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793015.nid006273.117983.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:14a468073411f7d913b12b12958d64739bbd2a3c29f1c94b9ecfc8d325ed5da8
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid005651.127873.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:207df892df43e9394cb11e2ff1b4f7f95d0039b25b74c6a900e0a8bd5208c8d7
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid006265.117563.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe26d220f3723b40d1eecd1dff70d045ca64c392ecc445005c44ea1a44a7c8fd
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006567.56933.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:107639576d96fa795148cb8550a24f8fb4843cb1fea76884db17ed6a8830cb0d
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006575.55528.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b7b3238afa138c1766b78154d5126f367ee9ee615cbceca0031b11ef351dde6
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793022.nid005643.128637.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eb81c8fea8287a723a191dcd2cac25f58dae45ecb292239565daeff7b5744b59
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793023.nid005499.130838.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c37ee75762d5906a51e76ad3873f3c60ca442e17373bfaa1ab28f2a8bf994cb
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793030.nid006090.118840.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d4c4baad370c93e5d77d108957721ff2d317a621ef1af7d7ff68613b258bc9d
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793032.nid006082.119082.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2dc49565466861ca44cbcc7856fbd1e612b5ef85984b213f5bcd98f029edfac3
|
3 |
+
size 40
|
220m200b1b5/tensorboard_220m200b1b5val/events.out.tfevents.1679048250.nid005617.100428.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f54193a536aeffe88bfa5a335d40b17950d0849e95ee52fed089d78d34b17375
|
3 |
+
size 980
|
2b812b4b/tensorboard_2b812b4bval/events.out.tfevents.1683534410.nid005943.115511.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f30d1b2ad94b178af57f6d4acf4195d63335c5830d80ccca348d5f14396be1b
|
3 |
+
size 980
|
2b816b4b/tensorboard_2b816b4bval/events.out.tfevents.1683561719.nid006565.21977.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1cc546baa926fa8573d48c6b8a9b97e0852cffcc2f6a5b9d670e04b65412704
|
3 |
+
size 980
|
4b248b12b/3490059.err
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b248b12b/3490059.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e5a466022447ef149b3b3c49260ed33208121c2bbd10b475858ef541a553b07
|
3 |
+
size 199058647
|
4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9377c402647d084fe9ee9c4767817b75eaed42cf13d5b0723e40080a55b5faf
|
3 |
+
size 199058647
|
4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a67f8bdad178e3d68ab5266bd869a9c5d7a3d31bb2fb0c50fbf134ef8a713d01
|
3 |
+
size 199058733
|
4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:270ebd675f83f6d8795726eb380cf4d49936061eea440afea1a7a8e6a8ff3ea8
|
3 |
+
size 199058733
|
4b248b12b/global_step45776/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b535b3009b9c39e0f2cae44eae00ebaf3a7dbfa970ed32c2c65878172e6c3e1
|
3 |
+
size 199058797
|