Muennighoff commited on
Commit
7f89c55
·
1 Parent(s): e19438b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. {14m2b7100mv → 14m2b7100m}/3307601.err +0 -0
  2. {14m2b7100mv → 14m2b7100m}/3307601.out +0 -0
  3. 21m400m400m/3487337.err +141 -0
  4. 21m400m400m/3487337.out +852 -0
  5. 21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  6. 21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  7. 21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  8. 21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  9. 21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  10. 21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  11. 21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  12. 21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  13. 21m400m400m/global_step762/layer_01-model_00-model_states.pt +3 -0
  14. 21m400m400m/global_step762/layer_03-model_00-model_states.pt +3 -0
  15. 21m400m400m/global_step762/layer_04-model_00-model_states.pt +3 -0
  16. 21m400m400m/global_step762/layer_05-model_00-model_states.pt +3 -0
  17. 21m400m400m/global_step762/layer_06-model_00-model_states.pt +3 -0
  18. 21m400m400m/global_step762/layer_07-model_00-model_states.pt +3 -0
  19. 21m400m400m/global_step762/layer_09-model_00-model_states.pt +3 -0
  20. 21m400m400m/global_step762/mp_rank_00_model_states.pt +3 -0
  21. 21m400m400m/sbatch_21m400m400m.sh +172 -0
  22. 21m400m400m/sbatch_21m400m400mval.sh +167 -0
  23. 21m400m400m/tensorboard_21m400m400m/events.out.tfevents.1683665937.nid005223.63334.0 +3 -0
  24. 21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666595.nid007269.50912.0 +3 -0
  25. 21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666830.nid007269.54799.0 +3 -0
  26. 220m200b1b5/sbatch_220m200b1b5.sh +166 -0
  27. 220m200b1b5/sbatch_220m200b1b5val.sh +167 -0
  28. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006529.96495.0 +3 -0
  29. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006860.1508.0 +3 -0
  30. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679054214.nid005116.13183.0 +3 -0
  31. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793013.nid006063.122512.0 +3 -0
  32. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793015.nid006273.117983.0 +3 -0
  33. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid005651.127873.0 +3 -0
  34. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid006265.117563.0 +3 -0
  35. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006567.56933.0 +3 -0
  36. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006575.55528.0 +3 -0
  37. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793022.nid005643.128637.0 +3 -0
  38. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793023.nid005499.130838.0 +3 -0
  39. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793030.nid006090.118840.0 +3 -0
  40. 220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793032.nid006082.119082.0 +3 -0
  41. 220m200b1b5/tensorboard_220m200b1b5val/events.out.tfevents.1679048250.nid005617.100428.0 +3 -0
  42. 2b812b4b/tensorboard_2b812b4bval/events.out.tfevents.1683534410.nid005943.115511.0 +3 -0
  43. 2b816b4b/tensorboard_2b816b4bval/events.out.tfevents.1683561719.nid006565.21977.0 +3 -0
  44. 4b248b12b/3490059.err +0 -0
  45. 4b248b12b/3490059.out +0 -0
  46. 4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  47. 4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt +3 -0
  48. 4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +3 -0
  49. 4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +3 -0
  50. 4b248b12b/global_step45776/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +3 -0
{14m2b7100mv → 14m2b7100m}/3307601.err RENAMED
File without changes
{14m2b7100mv → 14m2b7100m}/3307601.out RENAMED
File without changes
21m400m400m/3487337.err ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0: 2023-05-10 00:12:26.642566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
2
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
3
+ 0: 2023-05-10 00:12:26.642574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
4
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
5
+ 0: 2023-05-10 00:12:26.642587: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
6
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
7
+ 0: 2023-05-10 00:12:26.642609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
8
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
9
+ 0: 2023-05-10 00:12:26.642609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
10
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
11
+ 0: 2023-05-10 00:12:26.642625: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
12
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
13
+ 0: 2023-05-10 00:12:26.642632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
14
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
15
+ 0: 2023-05-10 00:12:26.642626: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
16
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
17
+ 0: 2023-05-10 00:12:35.492522: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
18
+ 0: 2023-05-10 00:12:35.492558: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
19
+ 0: 2023-05-10 00:12:35.492577: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
20
+ 0: 2023-05-10 00:12:35.492623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
21
+ 0: 2023-05-10 00:12:35.492620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
22
+ 0: 2023-05-10 00:12:35.492624: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
23
+ 0: 2023-05-10 00:12:35.492635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
24
+ 0: 2023-05-10 00:12:35.492657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
25
+ 0: 2023-05-10 00:12:35.493431: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
26
+ 0: 2023-05-10 00:12:35.493451: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
27
+ 0: 2023-05-10 00:12:35.493467: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
28
+ 0: 2023-05-10 00:12:35.493463: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
29
+ 0: 2023-05-10 00:12:35.493489: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
30
+ 0: 2023-05-10 00:12:35.493493: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
31
+ 0: 2023-05-10 00:12:35.493496: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
32
+ 0: 2023-05-10 00:12:35.493501: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
33
+ 0: 2023-05-10 00:12:57.097945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
34
+ 0: 2023-05-10 00:12:57.097970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
35
+ 0: 2023-05-10 00:12:57.097996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
36
+ 0: 2023-05-10 00:12:57.098006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
37
+ 0: 2023-05-10 00:12:57.098019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
38
+ 0: 2023-05-10 00:12:57.098024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
39
+ 0: 2023-05-10 00:12:57.098033: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
40
+ 0: 2023-05-10 00:12:57.098214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
41
+ 0: 2023-05-10 00:12:57.118445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
42
+ 0: 2023-05-10 00:12:57.118446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
43
+ 0: 2023-05-10 00:12:57.118459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
44
+ 0: 2023-05-10 00:12:57.118463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
45
+ 0: 2023-05-10 00:12:57.118466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
46
+ 0: 2023-05-10 00:12:57.118480: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
47
+ 0: 2023-05-10 00:12:57.118479: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
48
+ 0: 2023-05-10 00:12:57.118472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
49
+ 0: 2023-05-10 00:12:57.118472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
50
+ 0: 2023-05-10 00:12:57.118473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps-rocm-5.2.3/aws-ofi-rccl:/opt/rocm/lib64:/opt/rocm/lib:/opt/rocm/rocprofiler/lib:/opt/rocm/rocprofiler/tool:/opt/rocm/roctracer/lib:/opt/rocm/roctracer/tool:/opt/rocm/hip/lib:/opt/cray/pe/python/3.9.13.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.2.0/lib64
51
+ 0: 2023-05-10 00:12:57.118504: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
52
+ 0: 2023-05-10 00:12:57.118507: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
53
+ 0: 2023-05-10 00:12:57.118503: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
54
+ 0: 2023-05-10 00:12:57.118515: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
55
+ 0: 2023-05-10 00:12:57.118516: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
56
+ 0: 2023-05-10 00:12:57.118517: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
57
+ 0: Successfully preprocessed all matching files.
58
+ 0: Detected CUDA files, patching ldflags
59
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
60
+ 0: Building extension module scaled_upper_triang_masked_softmax_cuda...
61
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
62
+ 0: Loading extension module scaled_upper_triang_masked_softmax_cuda...
63
+ 0: Successfully preprocessed all matching files.
64
+ 0: Detected CUDA files, patching ldflags
65
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
66
+ 0: Building extension module scaled_masked_softmax_cuda...
67
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
68
+ 0: Loading extension module scaled_masked_softmax_cuda...
69
+ 0: Successfully preprocessed all matching files.
70
+ 0: Detected CUDA files, patching ldflags
71
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
72
+ 0: Building extension module fused_mix_prec_layer_norm_cuda...
73
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
74
+ 0: Loading extension module fused_mix_prec_layer_norm_cuda...
75
+ 0: Successfully preprocessed all matching files.
76
+ 0: Successfully preprocessed all matching files.
77
+ 0: Successfully preprocessed all matching files.
78
+ 0: Successfully preprocessed all matching files.
79
+ 0: Successfully preprocessed all matching files.
80
+ 0: Successfully preprocessed all matching files.
81
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
82
+ 0: warnings.warn(
83
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
84
+ 0: warnings.warn(
85
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
86
+ 0: warnings.warn(
87
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
88
+ 0: warnings.warn(
89
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
90
+ 0: warnings.warn(
91
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
92
+ 0: warnings.warn(
93
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
94
+ 0: warnings.warn(
95
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
96
+ 0: warnings.warn(
97
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
98
+ 0:
99
+ 0:
100
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
101
+ 0:
102
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
103
+ 0:
104
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
105
+ 0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja...
106
+ 0: Building extension module utils...
107
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
108
+ 0: Loading extension module utils...
109
+ 0: Loading extension module utils...
110
+ 0: Loading extension module utils...
111
+ 0: Loading extension module utils...
112
+ 0: Loading extension module utils...
113
+ 0: Loading extension module utils...
114
+ 0: Loading extension module utils...
115
+ 0: Loading extension module utils...
116
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
117
+ 0:
118
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
119
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
120
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
121
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
122
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
123
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...
124
+ 0:
125
+ 0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...
126
+ 0:
127
+ 0:
128
+ 0: Loading extension module utils...
129
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
130
+ 0: Loading extension module utils...
131
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
132
+ 0: Loading extension module utils...
133
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
134
+ 0: Loading extension module utils...
135
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
136
+ 0: Loading extension module utils...
137
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
138
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
139
+ 0: Loading extension module utils...
140
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
141
+ 0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
21m400m400m/3487337.out ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model parameters: d_model 288 ffw_size 1152 kv_size 32 n_heads 7 n_layers 5
2
+ Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 5 --hidden-size 288 --num-attention-heads 7 --kv-channels 32 --ffn-hidden-size 1152 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 32 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-21m400m400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_21m400m400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_21m400m400m --load checkpoints_21m400m400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3487337.json --zero-stage 0
3
+ START 3487337: Wed 10 May 2023 12:11:40 AM EEST
4
+ 0:
5
+ 0:
6
+ 0: ======================= ROCm System Management Interface =======================
7
+ 0: ================================= Concise Info =================================
8
+ 0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
9
+ 0: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
10
+ 0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
11
+ 0: 2 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
12
+ 0: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
13
+ 0: 4 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
14
+ 0: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
15
+ 0: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
16
+ 0: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
17
+ 0: ================================================================================
18
+ 0: ============================= End of ROCm SMI Log ==============================
19
+ 0: Launching on nid007269 (0/1), master nid007269 port 9999, GPUs 8, CUDA: True
20
+ 0: using world size: 8, data-parallel-size: 8, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
21
+ 0: accumulate and all-reduce gradients in fp32 for bfloat16 data type.
22
+ 0: using torch.bfloat16 for parameters ...
23
+ 0: ------------------------ arguments ------------------------
24
+ 0: abort_on_unmet_fused_kernel_constraints ......... False
25
+ 0: accumulate_allreduce_grads_in_fp32 .............. True
26
+ 0: adam_beta1 ...................................... 0.9
27
+ 0: adam_beta2 ...................................... 0.999
28
+ 0: adam_eps ........................................ 1e-08
29
+ 0: adlr_autoresume ................................. False
30
+ 0: adlr_autoresume_interval ........................ 1000
31
+ 0: apply_query_key_layer_scaling ................... True
32
+ 0: apply_residual_connection_post_layernorm ........ False
33
+ 0: attention_dropout ............................... 0.1
34
+ 0: attention_softmax_in_fp32 ....................... False
35
+ 0: bert_binary_head ................................ True
36
+ 0: bert_load ....................................... None
37
+ 0: bf16 ............................................ True
38
+ 0: bias_dropout_fusion ............................. True
39
+ 0: bias_gelu_fusion ................................ True
40
+ 0: biencoder_projection_dim ........................ 0
41
+ 0: biencoder_shared_query_context_model ............ False
42
+ 0: block_data_path ................................. None
43
+ 0: checkpoint_activations .......................... False
44
+ 0: checkpoint_in_cpu ............................... False
45
+ 0: checkpoint_num_layers ........................... 1
46
+ 0: clip_grad ....................................... 1.0
47
+ 0: codecarbon_dir .................................. None
48
+ 0: consumed_train_samples .......................... 0
49
+ 0: consumed_train_tokens ........................... 0
50
+ 0: consumed_valid_samples .......................... 0
51
+ 0: contigious_checkpointing ........................ False
52
+ 0: cpu_optimizer ................................... False
53
+ 0: cpu_torch_adam .................................. False
54
+ 0: curriculum_learning ............................. False
55
+ 0: data_impl ....................................... mmap
56
+ 0: data_parallel_size .............................. 8
57
+ 0: data_path ....................................... None
58
+ 0: dataloader_type ................................. single
59
+ 0: DDP_impl ........................................ local
60
+ 0: decoder_seq_length .............................. None
61
+ 0: deepscale ....................................... False
62
+ 0: deepscale_config ................................ None
63
+ 0: deepspeed ....................................... True
64
+ 0: deepspeed_activation_checkpointing .............. False
65
+ 0: deepspeed_config ................................ ds_configs/3487337.json
66
+ 0: deepspeed_mpi ................................... False
67
+ 0: distribute_checkpointed_activations ............. False
68
+ 0: distributed_backend ............................. nccl
69
+ 0: embed_layernorm ................................. False
70
+ 0: embedding_path .................................. None
71
+ 0: encoder_seq_length .............................. 2048
72
+ 0: eod_mask_loss ................................... False
73
+ 0: eval_interval ................................... 1
74
+ 0: eval_iters ...................................... 100
75
+ 0: eval_only ....................................... None
76
+ 0: evidence_data_path .............................. None
77
+ 0: exit_duration_in_mins ........................... None
78
+ 0: exit_interval ................................... None
79
+ 0: ffn_hidden_size ................................. 1152
80
+ 0: finetune ........................................ False
81
+ 0: fp16 ............................................ False
82
+ 0: fp16_lm_cross_entropy ........................... False
83
+ 0: fp32_residual_connection ........................ False
84
+ 0: gigaflos_no_embeds .............................. 0
85
+ 0: global_batch_size ............................... 256
86
+ 0: glu_activation .................................. None
87
+ 0: hidden_dropout .................................. 0.1
88
+ 0: hidden_size ..................................... 288
89
+ 0: hysteresis ...................................... 2
90
+ 0: ict_head_size ................................... None
91
+ 0: ict_load ........................................ None
92
+ 0: img_dim ......................................... 224
93
+ 0: indexer_batch_size .............................. 128
94
+ 0: indexer_log_interval ............................ 1000
95
+ 0: inference ....................................... False
96
+ 0: init_method_std ................................. 0.02
97
+ 0: init_method_xavier_uniform ...................... False
98
+ 0: initial_loss_scale .............................. 4294967296
99
+ 0: kill_switch_path ................................ kill-switch-21m400m400mval
100
+ 0: kv_channels ..................................... 32
101
+ 0: layer_norm_fusion ............................... True
102
+ 0: layernorm_epsilon ............................... 1e-05
103
+ 0: lazy_mpu_init ................................... None
104
+ 0: load ............................................ checkpoints_21m400m400m
105
+ 0: local_rank ...................................... None
106
+ 0: log_batch_size_to_tensorboard ................... True
107
+ 0: log_interval .................................... 10
108
+ 0: log_learning_rate_to_tensorboard ................ True
109
+ 0: log_level ....................................... None
110
+ 0: log_level_replica ............................... None
111
+ 0: log_loss_scale_to_tensorboard ................... True
112
+ 0: log_num_zeros_in_grad ........................... False
113
+ 0: log_params_norm ................................. False
114
+ 0: log_path ........................................ None
115
+ 0: log_timers_to_tensorboard ....................... True
116
+ 0: log_validation_ppl_to_tensorboard ............... True
117
+ 0: loss_on_targets_only ............................ False
118
+ 0: loss_scale ...................................... None
119
+ 0: loss_scale_window ............................... 1000
120
+ 0: lr .............................................. 0.0002
121
+ 0: lr_decay_iters .................................. None
122
+ 0: lr_decay_samples ................................ 1
123
+ 0: lr_decay_style .................................. cosine
124
+ 0: lr_decay_tokens ................................. None
125
+ 0: lr_warmup_fraction .............................. None
126
+ 0: lr_warmup_iters ................................. 0
127
+ 0: lr_warmup_samples ............................... 0
128
+ 0: make_vocab_size_divisible_by .................... 128
129
+ 0: mask_prob ....................................... 0.15
130
+ 0: masked_softmax_fusion ........................... True
131
+ 0: max_position_embeddings ......................... 2048
132
+ 0: mean_noise_span_length .......................... None
133
+ 0: memory_centric_tiled_linear ..................... False
134
+ 0: merge_file ...................................... gpt2/merges.txt
135
+ 0: micro_batch_size ................................ 32
136
+ 0: min_loss_scale .................................. 1.0
137
+ 0: min_lr .......................................... 2e-05
138
+ 0: mmap_warmup ..................................... False
139
+ 0: no_load_optim ................................... True
140
+ 0: no_load_rng ..................................... None
141
+ 0: no_save_optim ................................... None
142
+ 0: no_save_rng ..................................... None
143
+ 0: noise_density ................................... None
144
+ 0: num_attention_heads ............................. 7
145
+ 0: num_channels .................................... 3
146
+ 0: num_classes ..................................... 1000
147
+ 0: num_layers ...................................... 5
148
+ 0: num_layers_per_virtual_pipeline_stage ........... None
149
+ 0: num_workers ..................................... 2
150
+ 0: onnx_safe ....................................... None
151
+ 0: openai_gelu ..................................... False
152
+ 0: optimizer ....................................... adam
153
+ 0: optimizer_fusion ................................ True
154
+ 0: override_lr_scheduler ........................... True
155
+ 0: pad_vocab_size_to ............................... None
156
+ 0: params_dtype .................................... torch.bfloat16
157
+ 0: partition_activations ........................... False
158
+ 0: patch_dim ....................................... 16
159
+ 0: pipeline_model_parallel_size .................... 1
160
+ 0: position_embedding_type ......................... PositionEmbeddingType.absolute
161
+ 0: pp_partition_method ............................. None
162
+ 0: profile_backward ................................ False
163
+ 0: query_in_block_prob ............................. 0.1
164
+ 0: rampup_batch_size ............................... None
165
+ 0: rank ............................................ 0
166
+ 0: remote_device ................................... none
167
+ 0: reset_attention_mask ............................ False
168
+ 0: reset_position_ids .............................. False
169
+ 0: reset_progress .................................. True
170
+ 0: retriever_report_topk_accuracies ................ []
171
+ 0: retriever_score_scaling ......................... False
172
+ 0: retriever_seq_length ............................ 256
173
+ 0: reweight_loss_based_on_position_frequency ....... False
174
+ 0: sample_rate ..................................... 1.0
175
+ 0: save ............................................ checkpoints_21m400m400m
176
+ 0: save_interval ................................... 1000
177
+ 0: scatter_gather_tensors_in_pipeline .............. True
178
+ 0: scattered_embeddings ............................ False
179
+ 0: seed ............................................ 1234
180
+ 0: seq_length ...................................... 2048
181
+ 0: sgd_momentum .................................... 0.9
182
+ 0: short_seq_prob .................................. 0.1
183
+ 0: skip_train_iteration_range ...................... None
184
+ 0: split ........................................... None
185
+ 0: split_transformers .............................. False
186
+ 0: sync_tp_duplicated_parameters ................... False
187
+ 0: synchronize_each_layer .......................... False
188
+ 0: tensor_model_parallel_size ...................... 1
189
+ 0: tensorboard_dir ................................. tensorboard_21m400m400mval
190
+ 0: tensorboard_log_interval ........................ 1
191
+ 0: tensorboard_queue_size .......................... 5
192
+ 0: test_weighted_split_paths ....................... None
193
+ 0: test_weighted_split_paths_path .................. None
194
+ 0: tile_factor ..................................... 1
195
+ 0: titles_data_path ................................ None
196
+ 0: tokenizer_name_or_path .......................... None
197
+ 0: tokenizer_type .................................. GPT2BPETokenizer
198
+ 0: train_iters ..................................... None
199
+ 0: train_samples ................................... 1
200
+ 0: train_tokens .................................... None
201
+ 0: train_weighted_split_names ...................... ['train']
202
+ 0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']]
203
+ 0: train_weighted_split_paths_path ................. None
204
+ 0: train_weighted_split_splits ..................... [['0:1']]
205
+ 0: train_weighted_split_weights .................... [['1.0']]
206
+ 0: universal_checkpoint ............................ False
207
+ 0: use_bnb_optimizer ............................... False
208
+ 0: use_checkpoint_lr_scheduler ..................... False
209
+ 0: use_contiguous_buffers_in_ddp ................... True
210
+ 0: use_cpu_initialization .......................... None
211
+ 0: use_one_sent_docs ............................... False
212
+ 0: use_pin_memory .................................. False
213
+ 0: valid_num_workers ............................... 2
214
+ 0: valid_weighted_split_names ...................... ['validation']
215
+ 0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']]
216
+ 0: valid_weighted_split_paths_path ................. None
217
+ 0: valid_weighted_split_splits ..................... [['0:1']]
218
+ 0: valid_weighted_split_weights .................... [['1.0']]
219
+ 0: virtual_pipeline_model_parallel_size ............ None
220
+ 0: vocab_extra_ids ................................. 0
221
+ 0: vocab_file ...................................... gpt2/vocab.json
222
+ 0: weight_decay .................................... 0.1
223
+ 0: world_size ...................................... 8
224
+ 0: zero_allgather_bucket_size ...................... 0.0
225
+ 0: zero_contigious_gradients ....................... False
226
+ 0: zero_reduce_bucket_size ......................... 0.0
227
+ 0: zero_reduce_scatter ............................. False
228
+ 0: zero_stage ...................................... 0
229
+ 0: -------------------- end of arguments ---------------------
230
+ 0: setting number of micro-batches to constant 1
231
+ 0: > building GPT2BPETokenizer tokenizer ...
232
+ 0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
233
+ 0: DeepSpeed general environment info:
234
+ 0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch']
235
+ 0: torch version .................... 1.13.0+rocm5.2
236
+ 0: torch cuda version ............... None
237
+ 0: torch hip version ................ 5.2.21151-afdc89f8
238
+ 0: nvcc version ..................... None
239
+ 0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed']
240
+ 0: deepspeed info ................... 0.7.5, unknown, unknown
241
+ 0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1
242
+ 0: **** Git info for Megatron: git_hash=unknown git_branch=unknown ****
243
+ 0: > initializing torch distributed ...
244
+ 0: [2023-05-10 00:13:50,262] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
245
+ 0: > setting tensorboard ...
246
+ 0: > initializing tensor model parallel with size 1
247
+ 0: > initializing pipeline model parallel with size 1
248
+ 0: > setting random seeds to 1234 ...
249
+ 0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
250
+ 0: > compiling dataset index builder ...
251
+ 0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
252
+ 0: make: Nothing to be done for 'default'.
253
+ 0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
254
+ 0: >>> done with dataset index builder. Compilation time: 0.118 seconds
255
+ 0: > compiling and loading fused kernels ...
256
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified]
257
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
258
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
259
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
260
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified]
261
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
262
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
263
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
264
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
265
+ 0: Total number of unsupported CUDA function calls: 0
266
+ 0:
267
+ 0:
268
+ 0: Total number of replaced kernel launches: 87
269
+ 0: ninja: no work to do.
270
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified]
271
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified]
272
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
273
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
274
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
275
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
276
+ 0: Total number of unsupported CUDA function calls: 0
277
+ 0:
278
+ 0:
279
+ 0: Total number of replaced kernel launches: 63
280
+ 0: ninja: no work to do.
281
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes]
282
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified]
283
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
284
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
285
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
286
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
287
+ 0: Total number of unsupported CUDA function calls: 0
288
+ 0:
289
+ 0:
290
+ 0: Total number of replaced kernel launches: 67
291
+ 0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so
292
+ 0: >>> done with compiling and loading fused kernels. Compilation time: 11.020 seconds
293
+ 0: time to initialize megatron (seconds): 26.195
294
+ 0: [after megatron is initialized] datetime: 2023-05-10 00:14:02
295
+ 0: building GPT model ...
296
+ 0: [2023-05-10 00:14:02,324] [INFO] [utils.py:827:see_memory_usage] Before Building Model
297
+ 0: [2023-05-10 00:14:02,325] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
298
+ 0: [2023-05-10 00:14:02,325] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.4 GB, percent = 7.4%
299
+ 0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
300
+ 0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7}
301
+ 0: [2023-05-10 00:14:02,576] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer
302
+ 0: stage=0 layers=12
303
+ 0: 0: _to_float16
304
+ 0: 1: EmbeddingPipe
305
+ 0: 2: <lambda>
306
+ 0: 3: ParallelTransformerLayerPipe
307
+ 0: 4: ParallelTransformerLayerPipe
308
+ 0: 5: ParallelTransformerLayerPipe
309
+ 0: 6: ParallelTransformerLayerPipe
310
+ 0: 7: ParallelTransformerLayerPipe
311
+ 0: 8: undo
312
+ 0: 9: MixedFusedLayerNorm
313
+ 0: 10: EmbeddingPipe
314
+ 0: 11: float16_to_fp32
315
+ 0: loss: CrossEntropy
316
+ 0: [2023-05-10 00:14:03,105] [INFO] [utils.py:827:see_memory_usage] After Building Model
317
+ 0: [2023-05-10 00:14:03,106] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
318
+ 0: [2023-05-10 00:14:03,106] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.41 GB, percent = 7.4%
319
+ 0: setting training iterations to 0
320
+ 0: > learning rate decay style: cosine
321
+ 0: DeepSpeed is enabled.
322
+ 0: [2023-05-10 00:14:03,107] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown
323
+ 0: [2023-05-10 00:14:08,294] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
324
+ 0: [2023-05-10 00:14:08,295] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer
325
+ 0: [2023-05-10 00:14:08,295] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer
326
+ 0: [2023-05-10 00:14:08,296] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
327
+ 0: [2023-05-10 00:14:08,296] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer
328
+ 0: [2023-05-10 00:14:08,414] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer
329
+ 0: [2023-05-10 00:14:08,415] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
330
+ 0: [2023-05-10 00:14:08,415] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 38.96 GB, percent = 7.7%
331
+ 0: ninja: no work to do.
332
+ 0: Time to load utils op: 0.7600808143615723 seconds
333
+ 0: Time to load utils op: 0.7584981918334961 seconds
334
+ 0: Time to load utils op: 0.7595300674438477 seconds
335
+ 0: Time to load utils op: 0.7585179805755615 seconds
336
+ 0: Time to load utils op: 0.7589340209960938 seconds
337
+ 0: Time to load utils op: 0.7584795951843262 seconds
338
+ 0: Time to load utils op: 0.6399149894714355 seconds
339
+ 0: Time to load utils op: 0.760486364364624 seconds
340
+ 0: [2023-05-10 00:14:09,166] [INFO] [utils.py:827:see_memory_usage] before initializing group 0
341
+ 0: [2023-05-10 00:14:09,167] [INFO] [utils.py:828:see_memory_usage] MA 0.04 GB Max_MA 0.04 GB CA 0.06 GB Max_CA 0 GB
342
+ 0: [2023-05-10 00:14:09,167] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 38.29 GB, percent = 7.6%
343
+ 0: Time to load utils op: 0.0005950927734375 secondsTime to load utils op: 0.00048732757568359375 seconds
344
+ 0:
345
+ 0: Time to load utils op: 0.00044083595275878906 seconds
346
+ 0: Time to load utils op: 0.0006310939788818359 seconds
347
+ 0: Time to load utils op: 0.0004439353942871094 seconds
348
+ 0: Time to load utils op: 0.0006113052368164062 seconds
349
+ 0: Time to load utils op: 0.0011534690856933594 seconds
350
+ 0: [2023-05-10 00:14:10,015] [INFO] [utils.py:827:see_memory_usage] after initializing group 0
351
+ 0: [2023-05-10 00:14:10,016] [INFO] [utils.py:828:see_memory_usage] MA 0.11 GB Max_MA 0.11 GB CA 0.15 GB Max_CA 0 GB
352
+ 0: [2023-05-10 00:14:10,016] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.55 GB, percent = 7.5%
353
+ 0: [2023-05-10 00:14:10,123] [INFO] [utils.py:827:see_memory_usage] before initializing group 1
354
+ 0: [2023-05-10 00:14:10,124] [INFO] [utils.py:828:see_memory_usage] MA 0.11 GB Max_MA 0.11 GB CA 0.15 GB Max_CA 0 GB
355
+ 0: [2023-05-10 00:14:10,124] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
356
+ 0: [2023-05-10 00:14:10,226] [INFO] [utils.py:827:see_memory_usage] after initializing group 1
357
+ 0: [2023-05-10 00:14:10,226] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
358
+ 0: [2023-05-10 00:14:10,227] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
359
+ 0: [2023-05-10 00:14:10,327] [INFO] [utils.py:827:see_memory_usage] before initializing group 2
360
+ 0: [2023-05-10 00:14:10,328] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
361
+ 0: [2023-05-10 00:14:10,328] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
362
+ 0: [2023-05-10 00:14:10,430] [INFO] [utils.py:827:see_memory_usage] after initializing group 2
363
+ 0: [2023-05-10 00:14:10,430] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
364
+ 0: [2023-05-10 00:14:10,430] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
365
+ 0: [2023-05-10 00:14:10,531] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer
366
+ 0: [2023-05-10 00:14:10,531] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.15 GB Max_CA 0 GB
367
+ 0: [2023-05-10 00:14:10,531] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
368
+ 0: [2023-05-10 00:14:10,637] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer
369
+ 0: [2023-05-10 00:14:10,638] [INFO] [utils.py:828:see_memory_usage] MA 0.14 GB Max_MA 0.14 GB CA 0.15 GB Max_CA 0 GB
370
+ 0: [2023-05-10 00:14:10,638] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
371
+ 0: [2023-05-10 00:14:10,738] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer
372
+ 0: [2023-05-10 00:14:10,739] [INFO] [utils.py:828:see_memory_usage] MA 0.14 GB Max_MA 0.14 GB CA 0.15 GB Max_CA 0 GB
373
+ 0: [2023-05-10 00:14:10,739] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.53 GB, percent = 7.5%
374
+ 0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
375
+ 0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler
376
+ 0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = <megatron.learning_rates.AnnealingLR object at 0x14b88d151fd0>
377
+ 0: [2023-05-10 00:14:10,739] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)]
378
+ 0: [2023-05-10 00:14:10,739] [INFO] [config.py:1007:print] DeepSpeedEngine configuration:
379
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] activation_checkpointing_config {
380
+ 0: "partition_activations": false,
381
+ 0: "contiguous_memory_optimization": false,
382
+ 0: "cpu_checkpointing": false,
383
+ 0: "number_checkpoints": null,
384
+ 0: "synchronize_checkpoint_boundary": false,
385
+ 0: "profile": false
386
+ 0: }
387
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
388
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] amp_enabled .................. False
389
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] amp_params ................... False
390
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] autotuning_config ............ {
391
+ 0: "enabled": false,
392
+ 0: "start_step": null,
393
+ 0: "end_step": null,
394
+ 0: "metric_path": null,
395
+ 0: "arg_mappings": null,
396
+ 0: "metric": "throughput",
397
+ 0: "model_info": null,
398
+ 0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results",
399
+ 0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps",
400
+ 0: "overwrite": true,
401
+ 0: "fast": true,
402
+ 0: "start_profile_step": 3,
403
+ 0: "end_profile_step": 5,
404
+ 0: "tuner_type": "gridsearch",
405
+ 0: "tuner_early_stopping": 5,
406
+ 0: "tuner_num_trials": 50,
407
+ 0: "model_info_path": null,
408
+ 0: "mp_size": 1,
409
+ 0: "max_train_batch_size": null,
410
+ 0: "min_train_batch_size": 1,
411
+ 0: "max_train_micro_batch_size_per_gpu": 1.024000e+03,
412
+ 0: "min_train_micro_batch_size_per_gpu": 1,
413
+ 0: "num_tuning_micro_batch_sizes": 3
414
+ 0: }
415
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] bfloat16_enabled ............. True
416
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False
417
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True
418
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False
419
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x14b88d151d60>
420
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] communication_data_type ...... None
421
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa
422
+ 0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
423
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] curriculum_enabled ........... False
424
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] curriculum_params ............ False
425
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dataloader_drop_last ......... False
426
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] disable_allgather ............ False
427
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dump_state ................... False
428
+ 0: [2023-05-10 00:14:10,740] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None
429
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False
430
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1
431
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer
432
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0
433
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100
434
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06
435
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01
436
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False
437
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] elasticity_enabled ........... False
438
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] flops_profiler_config ........ {
439
+ 0: "enabled": false,
440
+ 0: "profile_step": 1,
441
+ 0: "module_depth": -1,
442
+ 0: "top_modules": 1,
443
+ 0: "detailed": true,
444
+ 0: "output_file": null
445
+ 0: }
446
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_auto_cast ............... None
447
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_enabled ................. False
448
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False
449
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] global_rank .................. 0
450
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1
451
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0
452
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0
453
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1
454
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] load_universal_checkpoint .... False
455
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] loss_scale ................... 1.0
456
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] memory_breakdown ............. False
457
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] monitor_config ............... <deepspeed.monitor.config.DeepSpeedMonitorConfig object at 0x14b88d151ca0>
458
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] nebula_config ................ {
459
+ 0: "enabled": false,
460
+ 0: "persistent_storage_path": null,
461
+ 0: "persistent_time_interval": 100,
462
+ 0: "num_of_version_in_retention": 2,
463
+ 0: "enable_nebula_load": true,
464
+ 0: "load_path": null
465
+ 0: }
466
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False
467
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_name ............... None
468
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] optimizer_params ............. None
469
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
470
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pld_enabled .................. False
471
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] pld_params ................... False
472
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] prescale_gradients ........... False
473
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] scheduler_name ............... None
474
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] scheduler_params ............. None
475
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] sparse_attention ............. None
476
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False
477
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] steps_per_print .............. 2000
478
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] train_batch_size ............. 256
479
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 32
480
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] use_node_local_storage ....... False
481
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False
482
+ 0: [2023-05-10 00:14:10,741] [INFO] [config.py:1011:print] world_size ................... 8
483
+ 0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False
484
+ 0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False
485
+ 0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_enabled ................. False
486
+ 0: [2023-05-10 00:14:10,742] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0
487
+ 0: [2023-05-10 00:14:10,742] [INFO] [config.py:996:print_user_config] json = {
488
+ 0: "train_micro_batch_size_per_gpu": 32,
489
+ 0: "train_batch_size": 256,
490
+ 0: "gradient_clipping": 1.0,
491
+ 0: "zero_optimization": {
492
+ 0: "stage": 0
493
+ 0: },
494
+ 0: "bf16": {
495
+ 0: "enabled": true
496
+ 0: },
497
+ 0: "steps_per_print": 2.000000e+03,
498
+ 0: "wall_clock_breakdown": false
499
+ 0: }
500
+ 0: Time to load utils op: 0.00042510032653808594 seconds
501
+ 0: [2023-05-10 00:14:10,742] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=32
502
+ 0: [2023-05-10 00:14:10,754] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=12 [0, 12) STAGE_PARAMS=19703712 (19.704M) TOTAL_PARAMS=19703712 (19.704M) UNIQUE_PARAMS=19703712 (19.704M)
503
+ 0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
504
+ 0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
505
+ 0: [2023-05-10 00:14:10,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
506
+ 0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
507
+ 0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
508
+ 0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
509
+ 0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
510
+ 0: [2023-05-10 00:14:10,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
511
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
512
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
513
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
514
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
515
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
516
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
517
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
518
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
519
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
520
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
521
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
522
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
523
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
524
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
525
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
526
+ 0: [2023-05-10 00:14:10,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt...
527
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
528
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
529
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
530
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
531
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
532
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
533
+ 0: [2023-05-10 00:14:10,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
534
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
535
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/mp_rank_00_model_states.pt.
536
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
537
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
538
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
539
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
540
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
541
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
542
+ 0: [2023-05-10 00:14:10,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
543
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
544
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
545
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
546
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
547
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
548
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
549
+ 0: [2023-05-10 00:14:11,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
550
+ 0: [2023-05-10 00:14:11,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
551
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
552
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
553
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
554
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
555
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
556
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
557
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
558
+ 0: [2023-05-10 00:14:11,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt...
559
+ 0: [2023-05-10 00:14:11,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
560
+ 0: [2023-05-10 00:14:11,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
561
+ 0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
562
+ 0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
563
+ 0: [2023-05-10 00:14:11,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
564
+ 0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
565
+ 0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
566
+ 0: [2023-05-10 00:14:11,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_01-model_00-model_states.pt.
567
+ 0: [2023-05-10 00:14:11,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
568
+ 0: [2023-05-10 00:14:11,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
569
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
570
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
571
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
572
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
573
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
574
+ 0: [2023-05-10 00:14:11,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
575
+ 0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
576
+ 0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
577
+ 0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
578
+ 0: [2023-05-10 00:14:11,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
579
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
580
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
581
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
582
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
583
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
584
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
585
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
586
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
587
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
588
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
589
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
590
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
591
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt...
592
+ 0: [2023-05-10 00:14:11,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
593
+ 0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
594
+ 0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
595
+ 0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
596
+ 0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
597
+ 0: [2023-05-10 00:14:11,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
598
+ 0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
599
+ 0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
600
+ 0: [2023-05-10 00:14:11,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_03-model_00-model_states.pt.
601
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
602
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
603
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
604
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
605
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
606
+ 0: [2023-05-10 00:14:11,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
607
+ 0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
608
+ 0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
609
+ 0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
610
+ 0: [2023-05-10 00:14:11,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
611
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
612
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
613
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
614
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
615
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
616
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
617
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
618
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
619
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
620
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
621
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
622
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
623
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt...
624
+ 0: [2023-05-10 00:14:11,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
625
+ 0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
626
+ 0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
627
+ 0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
628
+ 0: [2023-05-10 00:14:11,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
629
+ 0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
630
+ 0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
631
+ 0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
632
+ 0: [2023-05-10 00:14:11,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_04-model_00-model_states.pt.
633
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
634
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
635
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
636
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
637
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
638
+ 0: [2023-05-10 00:14:11,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
639
+ 0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
640
+ 0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
641
+ 0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
642
+ 0: [2023-05-10 00:14:11,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
643
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
644
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
645
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
646
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
647
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
648
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
649
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
650
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
651
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
652
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
653
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
654
+ 0: [2023-05-10 00:14:11,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt...
655
+ 0: [2023-05-10 00:14:11,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
656
+ 0: [2023-05-10 00:14:11,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
657
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
658
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
659
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
660
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
661
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
662
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
663
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
664
+ 0: [2023-05-10 00:14:11,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_05-model_00-model_states.pt.
665
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
666
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
667
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
668
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
669
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
670
+ 0: [2023-05-10 00:14:11,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
671
+ 0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
672
+ 0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
673
+ 0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
674
+ 0: [2023-05-10 00:14:11,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
675
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
676
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
677
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
678
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
679
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
680
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
681
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
682
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
683
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
684
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
685
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
686
+ 0: [2023-05-10 00:14:11,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt...
687
+ 0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
688
+ 0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
689
+ 0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
690
+ 0: [2023-05-10 00:14:11,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
691
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
692
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
693
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
694
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
695
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
696
+ 0: [2023-05-10 00:14:11,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_06-model_00-model_states.pt.
697
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
698
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
699
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
700
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
701
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
702
+ 0: [2023-05-10 00:14:11,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
703
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
704
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
705
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
706
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
707
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
708
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
709
+ 0: [2023-05-10 00:14:11,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
710
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
711
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
712
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
713
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
714
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
715
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
716
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
717
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
718
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt...
719
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
720
+ 0: [2023-05-10 00:14:11,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
721
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
722
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
723
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
724
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
725
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
726
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_07-model_00-model_states.pt.
727
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
728
+ 0: [2023-05-10 00:14:11,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
729
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
730
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
731
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
732
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
733
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
734
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
735
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
736
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
737
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
738
+ 0: [2023-05-10 00:14:11,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
739
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
740
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
741
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
742
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
743
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
744
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
745
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
746
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
747
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
748
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
749
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
750
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
751
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
752
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt...
753
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
754
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
755
+ 0: [2023-05-10 00:14:11,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
756
+ 0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
757
+ 0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
758
+ 0: [2023-05-10 00:14:11,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/layer_09-model_00-model_states.pt.
759
+ 0: > overriding learning rate value to 0.0002
760
+ 0: > overriding minimum learning rate value to 2e-05
761
+ 0: > overriding warmup iterations value to 0
762
+ 0: > overriding total number of iterations value to 1
763
+ 0: > overriding decay style value to cosine
764
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
765
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
766
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
767
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
768
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
769
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
770
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
771
+ 0: [2023-05-10 00:14:11,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
772
+ 0: [2023-05-10 00:14:11,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
773
+ 0: [2023-05-10 00:14:11,363] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 4
774
+ 0: [2023-05-10 00:14:11,365] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 4
775
+ 0: [2023-05-10 00:14:11,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
776
+ 0: [2023-05-10 00:14:11,376] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 3
777
+ 0: [2023-05-10 00:14:11,378] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 3
778
+ 0: [2023-05-10 00:14:11,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
779
+ 0: [2023-05-10 00:14:11,387] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0
780
+ 0: [2023-05-10 00:14:11,390] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0
781
+ 0: could not find arguments in the checkpoint ...
782
+ 0: checkpoint version 3.0
783
+ 0: [2023-05-10 00:14:11,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
784
+ 0: [2023-05-10 00:14:11,395] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 5
785
+ 0: [2023-05-10 00:14:11,397] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 5
786
+ 0: [2023-05-10 00:14:11,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
787
+ 0: [2023-05-10 00:14:11,398] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 2
788
+ 0: [2023-05-10 00:14:11,400] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 2
789
+ 0: [2023-05-10 00:14:11,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
790
+ 0: [2023-05-10 00:14:11,409] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 1
791
+ 0: [2023-05-10 00:14:11,411] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 1
792
+ 0: [2023-05-10 00:14:11,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
793
+ 0: [2023-05-10 00:14:11,444] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 7
794
+ 0: [2023-05-10 00:14:11,445] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 7
795
+ 0: [2023-05-10 00:14:11,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
796
+ 0: [2023-05-10 00:14:11,454] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 6
797
+ 0: [2023-05-10 00:14:11,455] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 6
798
+ 0: successfully loaded checkpoint from checkpoints_21m400m400m at iteration 0
799
+ 0: time (ms) | load-checkpoint: 701.61
800
+ 0: estimated model parameters: 0.019703712
801
+ 0: estimated model parameters without embeddings: 0.004626336
802
+ 0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-05-10 00:14:12
803
+ 0: > building train, validation, and test datasets ...
804
+ 0: > datasets target sizes (minimum size):
805
+ 0: train: 1
806
+ 0: validation: 25600
807
+ 0: test: 25600
808
+ 0: > building train, validation, and test datasets for GPT ...
809
+ 0: > building dataset index ...
810
+ 0: reading sizes...
811
+ 0: reading pointers...
812
+ 0: reading document index...
813
+ 0: creating numpy buffer of mmap...
814
+ 0: creating memory view of numpy buffer...
815
+ 0: > finished creating indexed dataset in 0.036932 seconds
816
+ 0: number of documents: 835726
817
+ 0: > dataset split:
818
+ 0: train:
819
+ 0: document indices in [0, 835726) total of 835726 documents
820
+ 0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy
821
+ 0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy
822
+ 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy
823
+ 0: loaded indexed file in 0.113 seconds
824
+ 0: total number of samples: 195101
825
+ 0: total number of epochs: 1
826
+ 0: > building dataset index ...
827
+ 0: reading sizes...
828
+ 0: reading pointers...
829
+ 0: reading document index...
830
+ 0: creating numpy buffer of mmap...
831
+ 0: creating memory view of numpy buffer...
832
+ 0: > finished creating indexed dataset in 0.110965 seconds
833
+ 0: number of documents: 364608
834
+ 0: > dataset split:
835
+ 0: validation:
836
+ 0: document indices in [0, 364608) total of 364608 documents
837
+ 0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy
838
+ 0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy
839
+ 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy
840
+ 0: loaded indexed file in 0.096 seconds
841
+ 0: total number of samples: 84978
842
+ 0: total number of epochs: 1
843
+ 0: > finished creating GPT datasets ...
844
+ 0: time (ms) | model-and-optimizer-setup: 9807.69 | train/valid/test-data-iterators-setup: 7326.47
845
+ 0: [after dataloaders are built] datetime: 2023-05-10 00:14:19
846
+ 0: done with setup ...
847
+ 0: training ...
848
+ 0: [after training is done] datetime: 2023-05-10 00:14:19
849
+ 0: -----------------------------------------------------------------------------------------------------------------
850
+ 0: validation loss at the end of training for val data | lm loss value: 6.096268E+00 | lm loss PPL: 4.441970E+02 |
851
+ 0: -----------------------------------------------------------------------------------------------------------------
852
+ END 3487337: Wed 10 May 2023 12:15:00 AM EEST
21m400m400m/global_step762/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac95f09abe87dd9c68b9b4a9829080de26ac03ebe49e9fab6898e669e10204ca
3
+ size 29560343
21m400m400m/global_step762/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb818b085f236a1a1f791c2f1a795db756a52a66ab08d0286bf5b8006b76794
3
+ size 29560215
21m400m400m/global_step762/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ef244aa36694f385dc28084076b72043fa3814eca2d07cf51b033500ef3989e
3
+ size 29560343
21m400m400m/global_step762/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1317c9587f00bee62633072cbf5394ec9ee83838a437729fb54847887465bfee
3
+ size 29560343
21m400m400m/global_step762/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6913ca4a0ae0e6991b668a9c7a92acc87ea502c5a5c096c07d8195a5cd06f69b
3
+ size 29560279
21m400m400m/global_step762/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef2876867f1b52a3d6362517c9d58f2b70641956bff4a03cea9b7a09fabb83c
3
+ size 29560343
21m400m400m/global_step762/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47ed8dd698a7b82205d723acc34b2f90ff96e074b123b252f2db5bd902c18174
3
+ size 29560471
21m400m400m/global_step762/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f5f4b3628d526b6ca81f845847cc6c26fafc52052188540565a0478e03ebf0a
3
+ size 29560727
21m400m400m/global_step762/layer_01-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88174b136681638ce4da8cddad57423ca02d50894fedede8a82b5ed6b21df9a5
3
+ size 30156035
21m400m400m/global_step762/layer_03-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da290e4e9bdc0b0bf908d9233566700072a2485174cf204ca66121dc8e16afe1
3
+ size 1854659
21m400m400m/global_step762/layer_04-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:248612036a8e9e67d8a9b5fd1ee266df4e0bc31afc9b7162f40554f7ebd5ff96
3
+ size 1854659
21m400m400m/global_step762/layer_05-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82a70fc53941272564fca38b712423a55b937d3b1395524d24a8e0b42cf2790
3
+ size 1854659
21m400m400m/global_step762/layer_06-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa99a8168c79f748cd8639466fd29bbb78f68c770fbf4b24763f6d72d2345b91
3
+ size 1854659
21m400m400m/global_step762/layer_07-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb0d4bf06d9d68253b1b5556324c495446c93f1237ad79195f56ff4b1cf4ecf
3
+ size 1854659
21m400m400m/global_step762/layer_09-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2670512cabbec37a36b100140fc6e12d6f75b5c85108026ccf5971554ee687f
3
+ size 2371
21m400m400m/global_step762/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9a98d0d201d95a08d421020b41ee795e8f824d673cf1d88e81bdc3733007d47
3
+ size 27827
21m400m400m/sbatch_21m400m400m.sh ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
3
+ #SBATCH --nodes=1
4
+ #SBATCH --ntasks-per-node=1
5
+ #SBATCH --cpus-per-task=32
6
+ #SBATCH --mem=256G
7
+ #SBATCH -p standard-g
8
+ #SBATCH -t 2-0:00:00
9
+ #SBATCH --gpus-per-node=mi250:8
10
+ #SBATCH --exclusive=user
11
+ #SBATCH --hint=nomultithread
12
+ #SBATCH --account=project_462000119
13
+ #SBATCH -o logs/%j.out
14
+ #SBATCH -e logs/%j.err
15
+
16
+ VARIANT=21m400m400m
17
+
18
+ # if run without sbatch, invoke here
19
+ if [ -z $SLURM_JOB_ID ]; then
20
+ mkdir -p logs
21
+ sbatch "$0"
22
+ exit
23
+ fi
24
+
25
+ set -euo pipefail
26
+
27
+ # symlink logs/latest.out and logs/latest.err
28
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
29
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
30
+
31
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
32
+ CHECKPOINT_PATH=checkpoints_$VARIANT
33
+ TENSORBOARD_PATH=tensorboard_$VARIANT
34
+ mkdir -p $CHECKPOINT_PATH
35
+ mkdir -p $TENSORBOARD_PATH
36
+
37
+ # Data
38
+ VOCAB_FILE="gpt2/vocab.json"
39
+ MERGE_FILE="gpt2/merges.txt"
40
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
41
+ TRAIN_DATA_PATH=train400m.txt
42
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document"
43
+ VALID_DATA_PATH=val.txt
44
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
45
+
46
+
47
+ PP_SIZE=1
48
+ TP_SIZE=1
49
+
50
+ MICRO_BATCH_SIZE=32
51
+ GRADIENT_ACCUMULATION_STEPS=1
52
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
53
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
54
+
55
+ # Model parameters
56
+ source model_params.sh
57
+ MODEL_PARAM=("${PARAM_21M[@]}")
58
+ NHIDDEN=${MODEL_PARAM[0]}
59
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
60
+ KV_SIZE=${MODEL_PARAM[2]}
61
+ NHEADS=${MODEL_PARAM[3]}
62
+ NLAYERS=${MODEL_PARAM[4]}
63
+ SEQ_LEN=2048
64
+
65
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
66
+
67
+ SAVE_INTERVAL=10000
68
+
69
+ # Tokens: 100 000 000
70
+ # -> Samples: 48828.125
71
+ #TRAIN_SAMPLES=48_828
72
+ # Tokens: 400M
73
+ # -> Samples: 195312.5 (400e6 / 2048, rounded up below)
74
+ TRAIN_SAMPLES=195_313
75
+
76
+
77
+ OPTIMIZER_ARGS=" \
78
+ --optimizer adam \
79
+ --adam-beta1 0.9 \
80
+ --adam-beta2 0.999 \
81
+ --adam-eps 1e-8 \
82
+ --lr 2e-4 \
83
+ --min-lr 2e-5 \
84
+ --lr-decay-style cosine \
85
+ --lr-decay-samples $TRAIN_SAMPLES \
86
+ --lr-warmup-samples 1953 \
87
+ --clip-grad 1.0 \
88
+ --weight-decay 1e-1 \
89
+ "
90
+
91
+ GPT_ARGS=" \
92
+ --num-layers $NLAYERS \
93
+ --hidden-size $NHIDDEN \
94
+ --num-attention-heads $NHEADS \
95
+ --kv-channels $KV_SIZE \
96
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
97
+ --seq-length $SEQ_LEN \
98
+ --max-position-embeddings $SEQ_LEN \
99
+ --micro-batch-size $MICRO_BATCH_SIZE \
100
+ --global-batch-size $GLOBAL_BATCH_SIZE \
101
+ --train-samples $TRAIN_SAMPLES \
102
+ --vocab-file $VOCAB_FILE \
103
+ --merge-file $MERGE_FILE \
104
+ --loss-scale 12 \
105
+ --clip-grad 1.0 \
106
+ --kill-switch-path $KILL_SWITCH_PATH \
107
+ --bf16 \
108
+ --checkpoint-activations \
109
+ $OPTIMIZER_ARGS \
110
+ "
111
+
112
+ OUTPUT_ARGS=" \
113
+ --log-interval 10 \
114
+ --save-interval $SAVE_INTERVAL \
115
+ --eval-interval 1000 \
116
+ --eval-iters 1 \
117
+ --tensorboard-dir $TENSORBOARD_PATH \
118
+ --tensorboard-queue-size 5 \
119
+ --log-timers-to-tensorboard \
120
+ --log-batch-size-to-tensorboard \
121
+ --log-validation-ppl-to-tensorboard \
122
+ "
123
+
124
+ ZERO_STAGE=0
125
+
126
+ mkdir -p ds_configs
127
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
128
+
129
+ cat <<EOF > $DS_CONFIG_PATH
130
+ {
131
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
132
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
133
+ "gradient_clipping": 1.0,
134
+ "zero_optimization": {
135
+ "stage": $ZERO_STAGE
136
+ },
137
+ "bf16": {
138
+ "enabled": true
139
+ },
140
+ "steps_per_print": 2000,
141
+ "wall_clock_breakdown": false
142
+ }
143
+ EOF
144
+
145
+ DEEPSPEED_ARGS=" \
146
+ --deepspeed \
147
+ --deepspeed_config $DS_CONFIG_PATH \
148
+ --zero-stage $ZERO_STAGE \
149
+ "
150
+
151
+ CMD=" \
152
+ Megatron-DeepSpeed/pretrain_gpt.py \
153
+ --tensor-model-parallel-size $TP_SIZE \
154
+ --pipeline-model-parallel-size $PP_SIZE \
155
+ $GPT_ARGS \
156
+ $OUTPUT_ARGS \
157
+ --save $CHECKPOINT_PATH \
158
+ --load $CHECKPOINT_PATH \
159
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
160
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
161
+ --data-impl mmap \
162
+ $DEEPSPEED_ARGS \
163
+ "
164
+
165
+ echo $CMD
166
+
167
+ echo "START $SLURM_JOBID: $(date)"
168
+
169
+ # bash launch_srun.sh $CMD
170
+ srun --label launch.sh $CMD
171
+
172
+ echo "END $SLURM_JOBID: $(date)"
21m400m400m/sbatch_21m400m400mval.sh ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
3
+ #SBATCH --nodes=1
4
+ #SBATCH --ntasks-per-node=1
5
+ #SBATCH --cpus-per-task=32
6
+ #SBATCH --mem=256G
7
+ #SBATCH -p small-g
8
+ #SBATCH -t 12:00:00
9
+ #SBATCH --gpus-per-node=mi250:8
10
+ #SBATCH --exclusive=user
11
+ #SBATCH --hint=nomultithread
12
+ #SBATCH --account=project_462000119
13
+ #SBATCH -o logs/%j.out
14
+ #SBATCH -e logs/%j.err
15
+
16
+ VARIANT=21m400m400mval
17
+ VARIANT_CKPT=21m400m400m
18
+
19
+ # if run without sbatch, invoke here
20
+ if [ -z $SLURM_JOB_ID ]; then
21
+ mkdir -p logs
22
+ sbatch "$0"
23
+ exit
24
+ fi
25
+
26
+ set -euo pipefail
27
+
28
+ # symlink logs/latest.out and logs/latest.err
29
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
30
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
31
+
32
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
33
+ CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT
34
+ TENSORBOARD_PATH=tensorboard_$VARIANT
35
+
36
+ # Data
37
+ VOCAB_FILE="gpt2/vocab.json"
38
+ MERGE_FILE="gpt2/merges.txt"
39
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
40
+ TRAIN_DATA_PATH=train400m.txt
41
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document"
42
+ VALID_DATA_PATH=val.txt
43
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
44
+
45
+ PP_SIZE=1
46
+ TP_SIZE=1
47
+
48
+ MICRO_BATCH_SIZE=32
49
+ GRADIENT_ACCUMULATION_STEPS=1
50
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
51
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
52
+
53
+ # Model parameters
54
+ source model_params.sh
55
+ MODEL_PARAM=("${PARAM_20M[@]}")
56
+ NHIDDEN=${MODEL_PARAM[0]}
57
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
58
+ KV_SIZE=${MODEL_PARAM[2]}
59
+ NHEADS=${MODEL_PARAM[3]}
60
+ NLAYERS=${MODEL_PARAM[4]}
61
+ SEQ_LEN=2048
62
+
63
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
64
+
65
+ SAVE_INTERVAL=1000
66
+
67
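+ # NOTE: stale figures, apparently left over from a training script (Tokens: 31633480000
68
+ # -> Samples: 15446035); this validation run only needs the placeholder value below.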
69
+ TRAIN_SAMPLES=1
70
+
71
+ OPTIMIZER_ARGS=" \
72
+ --optimizer adam \
73
+ --adam-beta1 0.9 \
74
+ --adam-beta2 0.999 \
75
+ --adam-eps 1e-8 \
76
+ --lr 2e-4 \
77
+ --min-lr 2e-5 \
78
+ --lr-decay-style cosine \
79
+ --lr-decay-samples $TRAIN_SAMPLES \
80
+ --lr-warmup-samples 0 \
81
+ --clip-grad 1.0 \
82
+ --weight-decay 1e-1 \
83
+ --override-lr-scheduler \
84
+ --no-load-optim \
85
+ --reset-progress \
86
+ "
87
+
88
+ GPT_ARGS=" \
89
+ --num-layers $NLAYERS \
90
+ --hidden-size $NHIDDEN \
91
+ --num-attention-heads $NHEADS \
92
+ --kv-channels $KV_SIZE \
93
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
94
+ --seq-length $SEQ_LEN \
95
+ --max-position-embeddings $SEQ_LEN \
96
+ --micro-batch-size $MICRO_BATCH_SIZE \
97
+ --global-batch-size $GLOBAL_BATCH_SIZE \
98
+ --train-samples $TRAIN_SAMPLES \
99
+ --vocab-file $VOCAB_FILE \
100
+ --merge-file $MERGE_FILE \
101
+ --clip-grad 1.0 \
102
+ --kill-switch-path $KILL_SWITCH_PATH \
103
+ --bf16 \
104
+ $OPTIMIZER_ARGS \
105
+ "
106
+
107
+ OUTPUT_ARGS=" \
108
+ --log-interval 10 \
109
+ --save-interval $SAVE_INTERVAL \
110
+ --eval-interval 1 \
111
+ --eval-iters 100 \
112
+ --tensorboard-dir $TENSORBOARD_PATH \
113
+ --tensorboard-queue-size 5 \
114
+ --log-timers-to-tensorboard \
115
+ --log-batch-size-to-tensorboard \
116
+ --log-validation-ppl-to-tensorboard \
117
+ "
118
+
119
+ ZERO_STAGE=0
120
+
121
+ mkdir -p ds_configs
122
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
123
+
124
+ cat <<EOF > $DS_CONFIG_PATH
125
+ {
126
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
127
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
128
+ "gradient_clipping": 1.0,
129
+ "zero_optimization": {
130
+ "stage": $ZERO_STAGE
131
+ },
132
+ "bf16": {
133
+ "enabled": true
134
+ },
135
+ "steps_per_print": 2000,
136
+ "wall_clock_breakdown": false
137
+ }
138
+ EOF
139
+
140
+ DEEPSPEED_ARGS=" \
141
+ --deepspeed \
142
+ --deepspeed_config $DS_CONFIG_PATH \
143
+ --zero-stage $ZERO_STAGE \
144
+ "
145
+
146
+ CMD=" \
147
+ Megatron-DeepSpeed/pretrain_gpt.py \
148
+ --tensor-model-parallel-size $TP_SIZE \
149
+ --pipeline-model-parallel-size $PP_SIZE \
150
+ $GPT_ARGS \
151
+ $OUTPUT_ARGS \
152
+ --save $CHECKPOINT_PATH \
153
+ --load $CHECKPOINT_PATH \
154
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
155
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
156
+ --data-impl mmap \
157
+ $DEEPSPEED_ARGS \
158
+ "
159
+
160
+ echo $CMD
161
+
162
+ echo "START $SLURM_JOBID: $(date)"
163
+
164
+ # bash launch_srun.sh $CMD
165
+ srun --label launch.sh $CMD
166
+
167
+ echo "END $SLURM_JOBID: $(date)"
21m400m400m/tensorboard_21m400m400m/events.out.tfevents.1683665937.nid005223.63334.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e21f3a911a130ca44d1743a160abe95a79e77640e38e8542149a3c89501d6f03
3
+ size 1369058
21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666595.nid007269.50912.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65257e05e3a25f4bf5fa93218d0fc6e10e8040d0f4501e532d87436fb186a6e6
3
+ size 40
21m400m400m/tensorboard_21m400m400mval/events.out.tfevents.1683666830.nid007269.54799.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e683f0205510ce039d2a2d0816282b94c3cdfb5b71fdac91e5280090b1a264b
3
+ size 980
220m200b1b5/sbatch_220m200b1b5.sh ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --nodes=8
3
+ #SBATCH --ntasks-per-node=1
4
+ #SBATCH --cpus-per-task=32
5
+ #SBATCH --mem=256G
6
+ #SBATCH -p standard-g
7
+ #SBATCH -t 48:00:00
8
+ #SBATCH --gpus-per-node=mi250:8
9
+ #SBATCH --exclusive=user
10
+ #SBATCH --hint=nomultithread
11
+ #SBATCH --account=project_462000119
12
+ #SBATCH -o logs/%j.out
13
+ #SBATCH -e logs/%j.err
14
+
15
+ VARIANT=220m200b1b5
16
+
17
+ # if run without sbatch, invoke here
18
+ if [ -z $SLURM_JOB_ID ]; then
19
+ mkdir -p logs
20
+ sbatch "$0"
21
+ exit
22
+ fi
23
+
24
+ set -euo pipefail
25
+
26
+ # symlink logs/latest.out and logs/latest.err
27
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
28
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
29
+
30
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
31
+ CHECKPOINT_PATH=checkpoints_$VARIANT
32
+ TENSORBOARD_PATH=tensorboard_$VARIANT
33
+ # Start from scratch
34
+ # rm -rf "$CHECKPOINT_PATH" "$TENSORBOARD_PATH"
35
+
36
+ # Data
37
+ VOCAB_FILE="gpt2/vocab.json"
38
+ MERGE_FILE="gpt2/merges.txt"
39
+ # DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
40
+ TRAIN_DATA_PATH=train1b5.txt
41
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document"
42
+ VALID_DATA_PATH=val.txt
43
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
44
+
45
+ PP_SIZE=1
46
+ TP_SIZE=1
47
+
48
+ MICRO_BATCH_SIZE=4
49
+ GRADIENT_ACCUMULATION_STEPS=1
50
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
51
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
52
+
53
+ # Model parameters
54
+ source model_params.sh
55
+ MODEL_PARAM=("${PARAM_217M[@]}")
56
+ NHIDDEN=${MODEL_PARAM[0]}
57
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
58
+ KV_SIZE=${MODEL_PARAM[2]}
59
+ NHEADS=${MODEL_PARAM[3]}
60
+ NLAYERS=${MODEL_PARAM[4]}
61
+ SEQ_LEN=2048
62
+
63
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
64
+
65
+ SAVE_INTERVAL=20000
66
+
67
+ # Tokens: 7510000000
68
+ # -> Samples: 3_666_992
69
+ TRAIN_SAMPLES=3_666_992
70
+ # Tokens: 200e9 -> Samples: 97_656_250 (200e9 / 2048); this assignment overrides the one above
71
+ TRAIN_SAMPLES=97_656_250
72
+
73
+ OPTIMIZER_ARGS=" \
74
+ --optimizer adam \
75
+ --adam-beta1 0.9 \
76
+ --adam-beta2 0.999 \
77
+ --adam-eps 1e-8 \
78
+ --lr 2e-4 \
79
+ --min-lr 2e-5 \
80
+ --lr-decay-style cosine \
81
+ --lr-decay-samples $TRAIN_SAMPLES \
82
+ --lr-warmup-samples 976_563 \
83
+ --clip-grad 1.0 \
84
+ --weight-decay 1e-1 \
85
+ "
86
+
87
+ GPT_ARGS=" \
88
+ --num-layers $NLAYERS \
89
+ --hidden-size $NHIDDEN \
90
+ --num-attention-heads $NHEADS \
91
+ --kv-channels $KV_SIZE \
92
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
93
+ --seq-length $SEQ_LEN \
94
+ --max-position-embeddings $SEQ_LEN \
95
+ --micro-batch-size $MICRO_BATCH_SIZE \
96
+ --global-batch-size $GLOBAL_BATCH_SIZE \
97
+ --train-samples $TRAIN_SAMPLES \
98
+ --vocab-file $VOCAB_FILE \
99
+ --merge-file $MERGE_FILE \
100
+ --clip-grad 1.0 \
101
+ --kill-switch-path $KILL_SWITCH_PATH \
102
+ --bf16 \
103
+ $OPTIMIZER_ARGS \
104
+ "
105
+
106
+ OUTPUT_ARGS=" \
107
+ --log-interval 500 \
108
+ --save-interval $SAVE_INTERVAL \
109
+ --eval-interval 50000 \
110
+ --eval-iters 1 \
111
+ --tensorboard-dir $TENSORBOARD_PATH \
112
+ --tensorboard-queue-size 5 \
113
+ --log-timers-to-tensorboard \
114
+ --log-batch-size-to-tensorboard \
115
+ --log-validation-ppl-to-tensorboard \
116
+ "
117
+
118
+ ZERO_STAGE=0
119
+
120
+ mkdir -p ds_configs
121
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
122
+
123
+ cat <<EOF > $DS_CONFIG_PATH
124
+ {
125
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
126
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
127
+ "gradient_clipping": 1.0,
128
+ "zero_optimization": {
129
+ "stage": $ZERO_STAGE
130
+ },
131
+ "bf16": {
132
+ "enabled": true
133
+ },
134
+ "steps_per_print": 2000,
135
+ "wall_clock_breakdown": false
136
+ }
137
+ EOF
138
+
139
+ DEEPSPEED_ARGS=" \
140
+ --deepspeed \
141
+ --deepspeed_config $DS_CONFIG_PATH \
142
+ --zero-stage $ZERO_STAGE \
143
+ "
144
+
145
+ CMD=" \
146
+ Megatron-DeepSpeed2/pretrain_gpt.py \
147
+ --tensor-model-parallel-size $TP_SIZE \
148
+ --pipeline-model-parallel-size $PP_SIZE \
149
+ $GPT_ARGS \
150
+ $OUTPUT_ARGS \
151
+ --save $CHECKPOINT_PATH \
152
+ --load $CHECKPOINT_PATH \
153
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
154
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
155
+ --data-impl mmap \
156
+ $DEEPSPEED_ARGS \
157
+ "
158
+
159
+ echo $CMD
160
+
161
+ echo "START $SLURM_JOBID: $(date)"
162
+
163
+ # bash launch_srun.sh $CMD
164
+ srun --label launch.sh $CMD
165
+
166
+ echo "END $SLURM_JOBID: $(date)"
220m200b1b5/sbatch_220m200b1b5val.sh ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --nodes=8
3
+ #SBATCH --ntasks-per-node=1
4
+ #SBATCH --cpus-per-task=32
5
+ #SBATCH --mem=256G
6
+ #SBATCH -p standard-g
7
+ #SBATCH -t 48:00:00
8
+ #SBATCH --gpus-per-node=mi250:8
9
+ #SBATCH --exclusive=user
10
+ #SBATCH --hint=nomultithread
11
+ #SBATCH --account=project_462000119
12
+ #SBATCH -o logs/%j.out
13
+ #SBATCH -e logs/%j.err
14
+
15
+ VARIANT=220m200b1b5val
16
+ VARIANT_CKPT=220m200b1b5
17
+
18
+ # if run without sbatch, invoke here
19
+ if [ -z $SLURM_JOB_ID ]; then
20
+ mkdir -p logs
21
+ sbatch "$0"
22
+ exit
23
+ fi
24
+
25
+ set -euo pipefail
26
+
27
+ # symlink logs/latest.out and logs/latest.err
28
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
29
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
30
+
31
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
32
+ CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT
33
+ TENSORBOARD_PATH=tensorboard_$VARIANT
34
+
35
+ # Data
36
+ VOCAB_FILE="gpt2/vocab.json"
37
+ MERGE_FILE="gpt2/merges.txt"
38
+ # DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
39
+ TRAIN_DATA_PATH=train1b5.txt
40
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document"
41
+ VALID_DATA_PATH=val.txt
42
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
43
+
44
+ PP_SIZE=1
45
+ TP_SIZE=1
46
+
47
+ MICRO_BATCH_SIZE=4
48
+ GRADIENT_ACCUMULATION_STEPS=1
49
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
50
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
51
+
52
+ # Model parameters
53
+ source model_params.sh
54
+ MODEL_PARAM=("${PARAM_217M[@]}")
55
+ NHIDDEN=${MODEL_PARAM[0]}
56
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
57
+ KV_SIZE=${MODEL_PARAM[2]}
58
+ NHEADS=${MODEL_PARAM[3]}
59
+ NLAYERS=${MODEL_PARAM[4]}
60
+ SEQ_LEN=2048
61
+
62
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
63
+
64
+ SAVE_INTERVAL=1000
65
+
66
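+ # NOTE: stale figures, apparently left over from a training script (Tokens: 7510000000
67
+ # -> Samples: 3_666_992); this evaluation run (see --eval-only below) uses a placeholder value.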
68
+ TRAIN_SAMPLES=1
69
+
70
+ OPTIMIZER_ARGS=" \
71
+ --optimizer adam \
72
+ --adam-beta1 0.9 \
73
+ --adam-beta2 0.999 \
74
+ --adam-eps 1e-8 \
75
+ --lr 2e-4 \
76
+ --min-lr 2e-5 \
77
+ --lr-decay-style cosine \
78
+ --lr-decay-samples $TRAIN_SAMPLES \
79
+ --lr-warmup-samples 0 \
80
+ --clip-grad 1.0 \
81
+ --weight-decay 1e-1 \
82
+ --no-load-optim \
83
+ --reset-progress \
84
+ --override-lr-scheduler \
85
+ "
86
+
87
+ GPT_ARGS=" \
88
+ --num-layers $NLAYERS \
89
+ --hidden-size $NHIDDEN \
90
+ --num-attention-heads $NHEADS \
91
+ --kv-channels $KV_SIZE \
92
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
93
+ --seq-length $SEQ_LEN \
94
+ --max-position-embeddings $SEQ_LEN \
95
+ --micro-batch-size $MICRO_BATCH_SIZE \
96
+ --global-batch-size $GLOBAL_BATCH_SIZE \
97
+ --train-samples $TRAIN_SAMPLES \
98
+ --vocab-file $VOCAB_FILE \
99
+ --merge-file $MERGE_FILE \
100
+ --clip-grad 1.0 \
101
+ --kill-switch-path $KILL_SWITCH_PATH \
102
+ --bf16 \
103
+ $OPTIMIZER_ARGS \
104
+ "
105
+
106
+ OUTPUT_ARGS=" \
107
+ --log-interval 10 \
108
+ --save-interval $SAVE_INTERVAL \
109
+ --eval-interval 1 \
110
+ --eval-only true \
111
+ --eval-iters 100 \
112
+ --tensorboard-dir $TENSORBOARD_PATH \
113
+ --tensorboard-queue-size 5 \
114
+ --log-timers-to-tensorboard \
115
+ --log-batch-size-to-tensorboard \
116
+ --log-validation-ppl-to-tensorboard \
117
+ "
118
+
119
+ ZERO_STAGE=0
120
+
121
+ mkdir -p ds_configs
122
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
123
+
124
+ cat <<EOF > $DS_CONFIG_PATH
125
+ {
126
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
127
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
128
+ "gradient_clipping": 1.0,
129
+ "zero_optimization": {
130
+ "stage": $ZERO_STAGE
131
+ },
132
+ "bf16": {
133
+ "enabled": true
134
+ },
135
+ "steps_per_print": 2000,
136
+ "wall_clock_breakdown": false
137
+ }
138
+ EOF
139
+
140
+ DEEPSPEED_ARGS=" \
141
+ --deepspeed \
142
+ --deepspeed_config $DS_CONFIG_PATH \
143
+ --zero-stage $ZERO_STAGE \
144
+ "
145
+
146
+ CMD=" \
147
+ Megatron-DeepSpeed/pretrain_gpt.py \
148
+ --tensor-model-parallel-size $TP_SIZE \
149
+ --pipeline-model-parallel-size $PP_SIZE \
150
+ $GPT_ARGS \
151
+ $OUTPUT_ARGS \
152
+ --save $CHECKPOINT_PATH \
153
+ --load $CHECKPOINT_PATH \
154
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
155
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
156
+ --data-impl mmap \
157
+ $DEEPSPEED_ARGS \
158
+ "
159
+
160
+ echo $CMD
161
+
162
+ echo "START $SLURM_JOBID: $(date)"
163
+
164
+ # bash launch_srun.sh $CMD
165
+ srun --label launch.sh $CMD
166
+
167
+ echo "END $SLURM_JOBID: $(date)"
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006529.96495.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d8a384f1edb99245e32c8e09b0b98d0e785f7fcc795b02ae7979e700b7bf876
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679051664.nid006860.1508.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5e7546168f06f9f3997d509636c095c97ad934478f199f0fb05746159f099a3
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1679054214.nid005116.13183.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86485eaeafb6eadfdec07df2066d12246f64fd1e41a62d7889ad18b31b08e77d
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793013.nid006063.122512.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54a286529cce0b3c873c9a9f87735d562748c34cdd700798b63d72f4c6cf4b04
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793015.nid006273.117983.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14a468073411f7d913b12b12958d64739bbd2a3c29f1c94b9ecfc8d325ed5da8
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid005651.127873.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:207df892df43e9394cb11e2ff1b4f7f95d0039b25b74c6a900e0a8bd5208c8d7
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793016.nid006265.117563.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe26d220f3723b40d1eecd1dff70d045ca64c392ecc445005c44ea1a44a7c8fd
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006567.56933.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:107639576d96fa795148cb8550a24f8fb4843cb1fea76884db17ed6a8830cb0d
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793017.nid006575.55528.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b7b3238afa138c1766b78154d5126f367ee9ee615cbceca0031b11ef351dde6
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793022.nid005643.128637.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb81c8fea8287a723a191dcd2cac25f58dae45ecb292239565daeff7b5744b59
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793023.nid005499.130838.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c37ee75762d5906a51e76ad3873f3c60ca442e17373bfaa1ab28f2a8bf994cb
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793030.nid006090.118840.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d4c4baad370c93e5d77d108957721ff2d317a621ef1af7d7ff68613b258bc9d
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5/events.out.tfevents.1680793032.nid006082.119082.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dc49565466861ca44cbcc7856fbd1e612b5ef85984b213f5bcd98f029edfac3
3
+ size 40
220m200b1b5/tensorboard_220m200b1b5val/events.out.tfevents.1679048250.nid005617.100428.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f54193a536aeffe88bfa5a335d40b17950d0849e95ee52fed089d78d34b17375
3
+ size 980
2b812b4b/tensorboard_2b812b4bval/events.out.tfevents.1683534410.nid005943.115511.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f30d1b2ad94b178af57f6d4acf4195d63335c5830d80ccca348d5f14396be1b
3
+ size 980
2b816b4b/tensorboard_2b816b4bval/events.out.tfevents.1683561719.nid006565.21977.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1cc546baa926fa8573d48c6b8a9b97e0852cffcc2f6a5b9d670e04b65412704
3
+ size 980
4b248b12b/3490059.err ADDED
The diff for this file is too large to render. See raw diff
 
4b248b12b/3490059.out ADDED
The diff for this file is too large to render. See raw diff
 
4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5a466022447ef149b3b3c49260ed33208121c2bbd10b475858ef541a553b07
3
+ size 199058647
4b248b12b/global_step45776/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9377c402647d084fe9ee9c4767817b75eaed42cf13d5b0723e40080a55b5faf
3
+ size 199058647
4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a67f8bdad178e3d68ab5266bd869a9c5d7a3d31bb2fb0c50fbf134ef8a713d01
3
+ size 199058733
4b248b12b/global_step45776/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:270ebd675f83f6d8795726eb380cf4d49936061eea440afea1a7a8e6a8ff3ea8
3
+ size 199058733
4b248b12b/global_step45776/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b535b3009b9c39e0f2cae44eae00ebaf3a7dbfa970ed32c2c65878172e6c3e1
3
+ size 199058797