diff --git "a/Pile_training_log.txt" "b/Pile_training_log.txt" new file mode 100644--- /dev/null +++ "b/Pile_training_log.txt" @@ -0,0 +1,1458 @@ +NeoXArgs.configure_distributed_args() using world size: 64 and model-parallel size: 1 +> building HFTokenizer tokenizer ... + > padded vocab (size: 50277) with 27 dummy tokens (new size: 50304) +> setting tensorboard ... +> initializing torch distributed ... +> initializing model parallel with size 1 +MPU DP: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] +MPU PP: [0] +MPU PP: [1] +MPU PP: [2] +MPU PP: [3] +MPU PP: [4] +MPU PP: [5] +MPU PP: [6] +MPU PP: [7] +MPU PP: [8] +MPU PP: [9] +MPU PP: [10] +MPU PP: [11] +MPU PP: [12] +MPU PP: [13] +MPU PP: [14] +MPU PP: [15] +MPU PP: [16] +MPU PP: [17] +MPU PP: [18] +MPU PP: [19] +MPU PP: [20] +MPU PP: [21] +MPU PP: [22] +MPU PP: [23] +MPU PP: [24] +MPU PP: [25] +MPU PP: [26] +MPU PP: [27] +MPU PP: [28] +MPU PP: [29] +MPU PP: [30] +MPU PP: [31] +MPU PP: [32] +MPU PP: [33] +MPU PP: [34] +MPU PP: [35] +MPU PP: [36] +MPU PP: [37] +MPU PP: [38] +MPU PP: [39] +MPU PP: [40] +MPU PP: [41] +MPU PP: [42] +MPU PP: [43] +MPU PP: [44] +MPU PP: [45] +MPU PP: [46] +MPU PP: [47] +MPU PP: [48] +MPU PP: [49] +MPU PP: [50] +MPU PP: [51] +MPU PP: [52] +MPU PP: [53] +MPU PP: [54] +MPU PP: [55] +MPU PP: [56] +MPU PP: [57] +MPU PP: [58] +MPU PP: [59] +MPU PP: [60] +MPU PP: [61] +MPU PP: [62] +MPU PP: [63] +MPU MP: [0] +MPU MP: [1] +MPU MP: [2] +MPU MP: [3] +MPU MP: [4] +MPU MP: [5] +MPU MP: [6] +MPU MP: [7] +MPU MP: [8] +MPU MP: [9] +MPU MP: [10] +MPU MP: [11] +MPU MP: [12] +MPU MP: [13] +MPU MP: [14] +MPU MP: [15] +MPU MP: [16] +MPU MP: [17] +MPU MP: [18] +MPU MP: [19] +MPU MP: [20] +MPU MP: [21] +MPU MP: [22] +MPU MP: [23] +MPU MP: [24] +MPU MP: [25] +MPU MP: [26] +MPU MP: [27] +MPU MP: [28] +MPU MP: [29] +MPU MP: [30] +MPU MP: [31] +MPU MP: [32] +MPU MP: [33] +MPU MP: [34] +MPU MP: [35] +MPU MP: [36] +MPU MP: [37] +MPU MP: [38] +MPU MP: [39] +MPU MP: [40] +MPU MP: [41] +MPU MP: [42] +MPU MP: [43] +MPU MP: [44] +MPU MP: [45] +MPU MP: [46] +MPU MP: [47] +MPU MP: [48] +MPU MP: [49] +MPU MP: [50] +MPU MP: [51] +MPU MP: [52] +MPU MP: [53] +MPU MP: [54] +MPU MP: [55] +MPU MP: [56] +MPU MP: [57] +MPU MP: [58] +MPU MP: [59] +MPU MP: [60] +MPU MP: [61] +MPU MP: [62] +MPU MP: [63] +> setting random seeds to 1234 ... +building GPT2 model ... +SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pipe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=46, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +stage=0 layers=29 + 0: EmbeddingPipe + 1: _pre_transformer_block + 2: ParallelFlownetLayerPipe + 3: ParallelFlownetLayerPipe + 4: ParallelFlownetLayerPipe + 5: ParallelFlownetLayerPipe + 6: ParallelFlownetLayerPipe + 7: ParallelFlownetLayerPipe + 8: ParallelFlownetLayerPipe + 9: ParallelFlownetLayerPipe + 10: ParallelFlownetLayerPipe + 11: ParallelFlownetLayerPipe + 12: ParallelFlownetLayerPipe + 13: ParallelFlownetLayerPipe + 14: ParallelFlownetLayerPipe + 15: ParallelFlownetLayerPipe + 16: ParallelFlownetLayerPipe + 17: ParallelFlownetLayerPipe + 18: ParallelFlownetLayerPipe + 19: ParallelFlownetLayerPipe + 20: ParallelFlownetLayerPipe + 21: ParallelFlownetLayerPipe + 22: ParallelFlownetLayerPipe + 23: ParallelFlownetLayerPipe + 24: ParallelFlownetLayerPipe + 25: ParallelFlownetLayerPipe + 26: _post_transformer_block + 27: NormPipe + 28: EmbeddingPipe + loss: partial +Configuring Optimizer type: Adam with params: {'lr': 0.0006, 'betas': [0.9, 0.95], 'eps': 1e-08} +> learning rate decay style: cosine +DeepSpeed is enabled. + > number of parameters on model parallel rank 0: 454166528 + > total params: 454,166,528 +Unable to load checkpoint. +Loading checkpoint and starting from iteration 0 +> building train, validation, and test datasets ... + reading sizes... + reading pointers... + reading document index... + creating numpy buffer of mmap... + creating memory view of numpy buffer... + > dataset split: + train: + document indices in [0, 130154259) total of 130154259 documents + validation: + document indices in [130154259, 134183803) total of 4029544 documents + test: + document indices in [134183803, 134318121) total of 134318 documents + > loading doc-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_train_indexmap_146432000ns_2048sl_1234s_doc_idx.npy + > loading sample-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_train_indexmap_146432000ns_2048sl_1234s_sample_idx.npy + > loading shuffle-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_train_indexmap_146432000ns_2048sl_1234s_shuffle_idx.npy + loaded indexed file in 0.148 seconds + total number of samples: 195915016 + total number of epochs: 2 + > loading doc-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_valid_indexmap_2969600ns_2048sl_1234s_doc_idx.npy + > loading sample-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_valid_indexmap_2969600ns_2048sl_1234s_sample_idx.npy + > loading shuffle-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_valid_indexmap_2969600ns_2048sl_1234s_shuffle_idx.npy + loaded indexed file in 0.264 seconds + total number of samples: 3097460 + total number of epochs: 1 + > loading doc-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_test_indexmap_102400ns_2048sl_1234s_doc_idx.npy + > loading sample-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_test_indexmap_102400ns_2048sl_1234s_sample_idx.npy + > loading shuffle-idx mapping from /u/wangh/workspace/dataset/language_dataset/pile/pile_0.87_deduped_text_document_test_indexmap_102400ns_2048sl_1234s_shuffle_idx.npy + loaded indexed file in 0.175 seconds + total number of samples: 102462 + total number of epochs: 1 +setting training data start iteration to 0 +setting validation data start iteration to 0 +done with setups ... +time (ms) | model and optimizer: 2883.87 | train/valid/test data iterators: 3989.32 +training ... +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step0 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step1 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step2 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step4 + samples/sec: 639.316 | iteration 500/ 143000 | elapsed time per iteration (ms): 1601.7 | learning rate: 2.098E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 6.487831E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +after 500 iterations memory (MB) | allocated: 1832.7998046875 | max allocated: 12938.84716796875 | reserved: 14128.0 | max reserved: 14128.0 +time (ms) | forward: 348.95 | backward: 1186.39 | backward-backward: 1186.35 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 3.73 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step8 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step16 + samples/sec: 650.796 | iteration 1000/ 143000 | elapsed time per iteration (ms): 1573.5 | learning rate: 4.196E-04 | approx flops per GPU: 73.5TFLOPS | lm_loss: 3.860402E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.28 | backward: 1184.07 | backward-backward: 1184.03 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 2.91 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step32 + samples/sec: 651.925 | iteration 1500/ 143000 | elapsed time per iteration (ms): 1570.7 | learning rate: 6.000E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 3.226961E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.64 | backward: 1184.17 | backward-backward: 1184.13 | backward-allreduce: 0.00 | optimizer: 42.32 | batch generator: 2.41 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step64 + samples/sec: 652.116 | iteration 2000/ 143000 | elapsed time per iteration (ms): 1570.3 | learning rate: 6.000E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.954752E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.85 | backward: 1183.66 | backward-backward: 1183.62 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step128 + samples/sec: 652.036 | iteration 2500/ 143000 | elapsed time per iteration (ms): 1570.5 | learning rate: 5.999E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.808394E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.55 | backward: 1183.16 | backward-backward: 1183.12 | backward-allreduce: 0.00 | optimizer: 42.29 | batch generator: 3.18 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step256 + samples/sec: 651.111 | iteration 3000/ 143000 | elapsed time per iteration (ms): 1572.7 | learning rate: 5.998E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.719992E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.37 | backward: 1184.12 | backward-backward: 1184.08 | backward-allreduce: 0.00 | optimizer: 43.55 | batch generator: 2.90 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step500 + samples/sec: 651.980 | iteration 3500/ 143000 | elapsed time per iteration (ms): 1570.6 | learning rate: 5.997E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.664136E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.91 | backward: 1183.67 | backward-backward: 1183.64 | backward-allreduce: 0.00 | optimizer: 42.46 | batch generator: 2.49 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step512 + samples/sec: 652.045 | iteration 4000/ 143000 | elapsed time per iteration (ms): 1570.4 | learning rate: 5.996E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.622160E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.57 | backward: 1181.95 | backward-backward: 1181.91 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 3.99 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step1000 + samples/sec: 649.170 | iteration 4500/ 143000 | elapsed time per iteration (ms): 1577.4 | learning rate: 5.994E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.590186E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 346.53 | backward: 1184.72 | backward-backward: 1184.68 | backward-allreduce: 0.00 | optimizer: 42.36 | batch generator: 8.20 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step1500 + samples/sec: 652.073 | iteration 5000/ 143000 | elapsed time per iteration (ms): 1570.4 | learning rate: 5.992E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.564515E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.04 | backward: 1182.55 | backward-backward: 1182.51 | backward-allreduce: 0.00 | optimizer: 42.24 | batch generator: 3.61 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step2000 +--------------------------------------------------------------------------------------------------------- + validation results at iteration 5000 | lm_loss value: 2.504278E+00 | lm_loss_ppl value: 1.223472E+01 | +--------------------------------------------------------------------------------------------------------- + samples/sec: 621.948 | iteration 5500/ 143000 | elapsed time per iteration (ms): 1646.4 | learning rate: 5.989E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.543121E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.24 | backward: 1186.24 | backward-backward: 1186.20 | backward-allreduce: 0.00 | optimizer: 42.20 | batch generator: 3.97 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step2500 + samples/sec: 652.270 | iteration 6000/ 143000 | elapsed time per iteration (ms): 1569.9 | learning rate: 5.986E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.524748E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1181.97 | backward-backward: 1181.93 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 3.35 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step3000 + samples/sec: 652.648 | iteration 6500/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 5.983E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.507319E+00 | loss scale: 65536.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.86 | backward: 1181.48 | backward-backward: 1181.44 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 3.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step3500 + samples/sec: 652.631 | iteration 7000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 5.979E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.494932E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.59 | backward: 1181.76 | backward-backward: 1181.72 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 3.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step4000 + samples/sec: 652.488 | iteration 7500/ 143000 | elapsed time per iteration (ms): 1569.4 | learning rate: 5.976E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.483774E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.14 | backward: 1181.28 | backward-backward: 1181.24 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 3.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step4500 + samples/sec: 651.103 | iteration 8000/ 143000 | elapsed time per iteration (ms): 1572.7 | learning rate: 5.971E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.472301E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.95 | backward: 1185.06 | backward-backward: 1185.02 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 3.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step5000 + samples/sec: 652.730 | iteration 8500/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 5.967E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.463485E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.07 | backward: 1181.14 | backward-backward: 1181.11 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 3.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step5500 + samples/sec: 651.298 | iteration 9000/ 143000 | elapsed time per iteration (ms): 1572.2 | learning rate: 5.962E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.453571E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.07 | backward: 1184.35 | backward-backward: 1184.31 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 3.55 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step6000 + samples/sec: 652.792 | iteration 9500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.957E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.445424E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.98 | backward: 1181.03 | backward-backward: 1180.99 | backward-allreduce: 0.00 | optimizer: 42.24 | batch generator: 3.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step6500 + samples/sec: 649.101 | iteration 10000/ 143000 | elapsed time per iteration (ms): 1577.6 | learning rate: 5.951E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.437648E+00 | loss scale: 65536.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.23 | backward: 1189.53 | backward-backward: 1189.49 | backward-allreduce: 0.00 | optimizer: 42.36 | batch generator: 3.75 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step7000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 10000 | lm_loss value: 2.385643E+00 | lm_loss_ppl value: 1.086604E+01 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 621.113 | iteration 10500/ 143000 | elapsed time per iteration (ms): 1648.7 | learning rate: 5.946E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.431769E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.16 | backward: 1188.63 | backward-backward: 1188.59 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 4.09 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step7500 + samples/sec: 649.277 | iteration 11000/ 143000 | elapsed time per iteration (ms): 1577.1 | learning rate: 5.939E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.425712E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.78 | backward: 1189.52 | backward-backward: 1189.48 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 3.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step8000 + samples/sec: 652.685 | iteration 11500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 5.933E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.418842E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.84 | backward: 1181.15 | backward-backward: 1181.11 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 3.50 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step8500 + samples/sec: 651.268 | iteration 12000/ 143000 | elapsed time per iteration (ms): 1572.3 | learning rate: 5.926E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.416124E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.73 | backward: 1185.96 | backward-backward: 1185.92 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step9000 + samples/sec: 652.685 | iteration 12500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 5.919E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.409112E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.48 | backward: 1181.62 | backward-backward: 1181.58 | backward-allreduce: 0.00 | optimizer: 42.27 | batch generator: 2.86 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step9500 + samples/sec: 652.810 | iteration 13000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.912E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.404251E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.07 | backward: 1181.92 | backward-backward: 1181.89 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 2.74 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step10000 + samples/sec: 652.789 | iteration 13500/ 143000 | elapsed time per iteration (ms): 1568.7 | learning rate: 5.904E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.399412E+00 | loss scale: 32768.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.03 | backward: 1182.03 | backward-backward: 1181.99 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 2.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step10500 + samples/sec: 652.830 | iteration 14000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.896E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.396646E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.34 | backward: 1181.55 | backward-backward: 1181.51 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 3.03 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step11000 + samples/sec: 652.705 | iteration 14500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 5.887E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.391815E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.28 | backward: 1181.69 | backward-backward: 1181.65 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 2.87 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step11500 + samples/sec: 652.933 | iteration 15000/ 143000 | elapsed time per iteration (ms): 1568.3 | learning rate: 5.879E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.388972E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 340.87 | backward: 1182.01 | backward-backward: 1181.97 | backward-allreduce: 0.00 | optimizer: 41.91 | batch generator: 2.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step12000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 15000 | lm_loss value: 2.333726E+00 | lm_loss_ppl value: 1.031631E+01 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.238 | iteration 15500/ 143000 | elapsed time per iteration (ms): 1640.4 | learning rate: 5.870E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.383346E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.68 | backward: 1182.22 | backward-backward: 1182.19 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.66 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step12500 + samples/sec: 652.592 | iteration 16000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 5.860E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.380331E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.78 | backward: 1182.22 | backward-backward: 1182.19 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.37 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step13000 + samples/sec: 652.787 | iteration 16500/ 143000 | elapsed time per iteration (ms): 1568.7 | learning rate: 5.851E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.378173E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.05 | backward: 1181.74 | backward-backward: 1181.70 | backward-allreduce: 0.00 | optimizer: 42.18 | batch generator: 2.59 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step13500 + samples/sec: 652.882 | iteration 17000/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.841E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.373147E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.83 | backward: 1181.78 | backward-backward: 1181.74 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.43 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step14000 + samples/sec: 652.892 | iteration 17500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.830E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.371378E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.65 | backward: 1181.87 | backward-backward: 1181.83 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 2.39 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step14500 + samples/sec: 653.083 | iteration 18000/ 143000 | elapsed time per iteration (ms): 1567.9 | learning rate: 5.820E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.366905E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 340.69 | backward: 1181.92 | backward-backward: 1181.88 | backward-allreduce: 0.00 | optimizer: 41.98 | batch generator: 2.35 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step15000 + samples/sec: 652.913 | iteration 18500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.809E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.366473E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.96 | backward: 1181.59 | backward-backward: 1181.55 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 2.66 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step15500 + samples/sec: 653.006 | iteration 19000/ 143000 | elapsed time per iteration (ms): 1568.1 | learning rate: 5.798E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.362913E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.17 | backward: 1181.43 | backward-backward: 1181.39 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 2.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step16000 + samples/sec: 652.846 | iteration 19500/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 5.786E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.360173E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.52 | backward: 1181.15 | backward-backward: 1181.11 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 2.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step16500 + samples/sec: 652.901 | iteration 20000/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.774E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.357131E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.25 | backward: 1180.59 | backward-backward: 1180.54 | backward-allreduce: 0.00 | optimizer: 42.06 | batch generator: 2.93 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step17000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 20000 | lm_loss value: 2.306157E+00 | lm_loss_ppl value: 1.003578E+01 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.426 | iteration 20500/ 143000 | elapsed time per iteration (ms): 1639.9 | learning rate: 5.762E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.361061E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.19 | backward: 1181.38 | backward-backward: 1181.35 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 3.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step17500 + samples/sec: 652.801 | iteration 21000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.750E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.352849E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1181.47 | backward-backward: 1181.43 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 2.87 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step18000 + samples/sec: 652.877 | iteration 21500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.737E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.352244E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.39 | backward: 1181.52 | backward-backward: 1181.48 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 3.06 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step18500 + samples/sec: 651.241 | iteration 22000/ 143000 | elapsed time per iteration (ms): 1572.4 | learning rate: 5.724E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.349852E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.89 | backward: 1185.93 | backward-backward: 1185.89 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 2.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step19000 + samples/sec: 652.793 | iteration 22500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.711E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.348597E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.64 | backward: 1182.19 | backward-backward: 1182.15 | backward-allreduce: 0.00 | optimizer: 42.18 | batch generator: 2.17 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step19500 + samples/sec: 652.845 | iteration 23000/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 5.697E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.345453E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.97 | backward: 1181.80 | backward-backward: 1181.76 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.42 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step20000 + samples/sec: 652.907 | iteration 23500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.683E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.343808E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.27 | backward: 1181.43 | backward-backward: 1181.39 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.88 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step20500 + samples/sec: 652.815 | iteration 24000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.669E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.343722E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.63 | backward: 1181.16 | backward-backward: 1181.12 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 3.28 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step21000 + samples/sec: 652.822 | iteration 24500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.654E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.339631E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.17 | backward: 1181.54 | backward-backward: 1181.50 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 2.74 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step21500 + samples/sec: 651.184 | iteration 25000/ 143000 | elapsed time per iteration (ms): 1572.5 | learning rate: 5.640E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.338007E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.25 | backward: 1185.26 | backward-backward: 1185.23 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 2.77 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step22000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 25000 | lm_loss value: 2.291977E+00 | lm_loss_ppl value: 9.894479E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.162 | iteration 25500/ 143000 | elapsed time per iteration (ms): 1640.6 | learning rate: 5.625E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.336621E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.53 | backward: 1181.45 | backward-backward: 1181.42 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 3.52 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step22500 + samples/sec: 652.844 | iteration 26000/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 5.609E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.335987E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.78 | backward: 1182.10 | backward-backward: 1182.07 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.37 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step23000 + samples/sec: 652.961 | iteration 26500/ 143000 | elapsed time per iteration (ms): 1568.2 | learning rate: 5.594E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.332942E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.11 | backward: 1181.48 | backward-backward: 1181.45 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 2.81 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step23500 + samples/sec: 652.803 | iteration 27000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.578E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.331611E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1181.01 | backward-backward: 1180.97 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 3.35 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step24000 + samples/sec: 652.875 | iteration 27500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.561E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.330000E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.49 | backward: 1180.26 | backward-backward: 1180.22 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 3.92 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step24500 + samples/sec: 652.805 | iteration 28000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.545E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.328682E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.23 | backward: 1180.46 | backward-backward: 1180.42 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 3.75 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step25000 + samples/sec: 652.996 | iteration 28500/ 143000 | elapsed time per iteration (ms): 1568.2 | learning rate: 5.528E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.327295E+00 | loss scale: 32768.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 342.21 | backward: 1180.53 | backward-backward: 1180.49 | backward-allreduce: 0.00 | optimizer: 41.82 | batch generator: 3.73 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step25500 + samples/sec: 652.723 | iteration 29000/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 5.511E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.325193E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.10 | backward: 1181.71 | backward-backward: 1181.67 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step26000 + samples/sec: 652.877 | iteration 29500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.494E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.325383E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.81 | backward: 1181.75 | backward-backward: 1181.72 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 2.42 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step26500 + samples/sec: 651.258 | iteration 30000/ 143000 | elapsed time per iteration (ms): 1572.3 | learning rate: 5.476E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.322457E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.07 | backward: 1185.52 | backward-backward: 1185.48 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.35 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step27000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 30000 | lm_loss value: 2.277179E+00 | lm_loss_ppl value: 9.749142E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.191 | iteration 30500/ 143000 | elapsed time per iteration (ms): 1640.5 | learning rate: 5.458E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.321187E+00 | loss scale: 65536.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 340.82 | backward: 1181.90 | backward-backward: 1181.86 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 2.55 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step27500 + samples/sec: 652.945 | iteration 31000/ 143000 | elapsed time per iteration (ms): 1568.3 | learning rate: 5.440E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.320848E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.04 | backward: 1181.65 | backward-backward: 1181.61 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 2.52 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step28000 + samples/sec: 652.636 | iteration 31500/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 5.422E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.317365E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.99 | backward: 1182.15 | backward-backward: 1182.11 | backward-allreduce: 0.00 | optimizer: 42.25 | batch generator: 2.37 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step28500 + samples/sec: 652.710 | iteration 32000/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 5.403E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.317320E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.00 | backward: 1181.85 | backward-backward: 1181.81 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 2.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step29000 + samples/sec: 652.873 | iteration 32500/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 5.384E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.316063E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.76 | backward: 1181.82 | backward-backward: 1181.79 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 2.22 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step29500 + samples/sec: 649.256 | iteration 33000/ 143000 | elapsed time per iteration (ms): 1577.2 | learning rate: 5.365E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.316235E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.07 | backward: 1185.57 | backward-backward: 1185.53 | backward-allreduce: 0.00 | optimizer: 42.24 | batch generator: 2.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step30000 + samples/sec: 652.593 | iteration 33500/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 5.346E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.314015E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.43 | backward: 1181.16 | backward-backward: 1181.13 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 2.80 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step30500 + samples/sec: 651.699 | iteration 34000/ 143000 | elapsed time per iteration (ms): 1571.3 | learning rate: 5.326E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.314418E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.89 | backward: 1181.82 | backward-backward: 1181.78 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.40 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step31000 + samples/sec: 652.795 | iteration 34500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.306E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.311603E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.05 | backward: 1181.60 | backward-backward: 1181.56 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 2.61 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step31500 + samples/sec: 652.959 | iteration 35000/ 143000 | elapsed time per iteration (ms): 1568.2 | learning rate: 5.286E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.310192E+00 | loss scale: 32768.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 340.61 | backward: 1182.14 | backward-backward: 1182.10 | backward-allreduce: 0.00 | optimizer: 41.93 | batch generator: 2.21 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step32000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 35000 | lm_loss value: 2.260724E+00 | lm_loss_ppl value: 9.590026E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.047 | iteration 35500/ 143000 | elapsed time per iteration (ms): 1640.9 | learning rate: 5.266E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.309177E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.88 | backward: 1181.97 | backward-backward: 1181.94 | backward-allreduce: 0.00 | optimizer: 42.22 | batch generator: 2.82 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step32500 + samples/sec: 652.613 | iteration 36000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 5.245E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.307947E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.15 | backward: 1181.74 | backward-backward: 1181.70 | backward-allreduce: 0.00 | optimizer: 42.37 | batch generator: 2.47 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step33000 + samples/sec: 652.829 | iteration 36500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.224E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.306247E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 340.91 | backward: 1181.90 | backward-backward: 1181.86 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 2.46 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step33500 + samples/sec: 652.736 | iteration 37000/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 5.203E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.305798E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.95 | backward: 1181.91 | backward-backward: 1181.87 | backward-allreduce: 0.00 | optimizer: 42.23 | batch generator: 2.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step34000 + samples/sec: 652.794 | iteration 37500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.182E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.305441E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.94 | backward: 1180.83 | backward-backward: 1180.80 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 3.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step34500 + samples/sec: 652.814 | iteration 38000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.160E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.305002E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1181.11 | backward-backward: 1181.07 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 2.92 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step35000 + samples/sec: 652.819 | iteration 38500/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.139E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.303915E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.52 | backward: 1182.10 | backward-backward: 1182.06 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.00 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step35500 + samples/sec: 652.797 | iteration 39000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.117E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.304620E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.88 | backward: 1181.73 | backward-backward: 1181.69 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 2.41 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step36000 + samples/sec: 652.844 | iteration 39500/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 5.094E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.301278E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.73 | backward: 1181.06 | backward-backward: 1181.02 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 3.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step36500 + samples/sec: 652.892 | iteration 40000/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.072E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.300157E+00 | loss scale: 65536.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.17 | backward: 1181.47 | backward-backward: 1181.43 | backward-allreduce: 0.00 | optimizer: 41.95 | batch generator: 2.63 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step37000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 40000 | lm_loss value: 2.251516E+00 | lm_loss_ppl value: 9.502133E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 624.226 | iteration 40500/ 143000 | elapsed time per iteration (ms): 1640.4 | learning rate: 5.049E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.298494E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.26 | backward: 1181.36 | backward-backward: 1181.32 | backward-allreduce: 0.00 | optimizer: 42.06 | batch generator: 3.17 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step37500 + samples/sec: 652.794 | iteration 41000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 5.026E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.298421E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.13 | backward: 1181.57 | backward-backward: 1181.53 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.59 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step38000 + samples/sec: 652.876 | iteration 41500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 5.003E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.297525E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.32 | backward: 1181.32 | backward-backward: 1181.28 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.91 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step38500 + samples/sec: 652.893 | iteration 42000/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 4.980E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.295306E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.92 | backward: 1181.70 | backward-backward: 1181.67 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 2.49 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step39000 + samples/sec: 652.632 | iteration 42500/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 4.956E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.296228E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.75 | backward: 1182.37 | backward-backward: 1182.33 | backward-allreduce: 0.00 | optimizer: 42.36 | batch generator: 2.23 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step39500 + samples/sec: 652.644 | iteration 43000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 4.933E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.293967E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.63 | backward: 1182.32 | backward-backward: 1182.28 | backward-allreduce: 0.00 | optimizer: 42.39 | batch generator: 2.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step40000 + samples/sec: 652.509 | iteration 43500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 4.909E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.294288E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.47 | backward: 1182.79 | backward-backward: 1182.75 | backward-allreduce: 0.00 | optimizer: 42.31 | batch generator: 2.00 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step40500 + samples/sec: 652.478 | iteration 44000/ 143000 | elapsed time per iteration (ms): 1569.4 | learning rate: 4.885E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.291232E+00 | loss scale: 16384.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 340.64 | backward: 1182.61 | backward-backward: 1182.58 | backward-allreduce: 0.00 | optimizer: 42.25 | batch generator: 2.17 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step41000 + samples/sec: 652.344 | iteration 44500/ 143000 | elapsed time per iteration (ms): 1569.7 | learning rate: 4.860E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.290781E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.42 | backward: 1182.10 | backward-backward: 1182.07 | backward-allreduce: 0.00 | optimizer: 42.40 | batch generator: 3.06 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step41500 + samples/sec: 652.071 | iteration 45000/ 143000 | elapsed time per iteration (ms): 1570.4 | learning rate: 4.836E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.290644E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.44 | backward: 1182.27 | backward-backward: 1182.23 | backward-allreduce: 0.00 | optimizer: 42.65 | batch generator: 3.02 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step42000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 45000 | lm_loss value: 2.237995E+00 | lm_loss_ppl value: 9.374518E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 623.794 | iteration 45500/ 143000 | elapsed time per iteration (ms): 1641.6 | learning rate: 4.811E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.289358E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.98 | backward: 1182.39 | backward-backward: 1182.35 | backward-allreduce: 0.00 | optimizer: 42.40 | batch generator: 2.93 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step42500 + samples/sec: 652.070 | iteration 46000/ 143000 | elapsed time per iteration (ms): 1570.4 | learning rate: 4.786E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.288439E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.03 | backward: 1182.84 | backward-backward: 1182.80 | backward-allreduce: 0.00 | optimizer: 42.74 | batch generator: 2.51 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step43000 + samples/sec: 652.546 | iteration 46500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 4.761E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.287162E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 340.91 | backward: 1182.29 | backward-backward: 1182.25 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 2.52 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step43500 + samples/sec: 652.040 | iteration 47000/ 143000 | elapsed time per iteration (ms): 1570.5 | learning rate: 4.736E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.285849E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.77 | backward: 1182.67 | backward-backward: 1182.63 | backward-allreduce: 0.00 | optimizer: 42.41 | batch generator: 2.33 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step44000 + samples/sec: 652.853 | iteration 47500/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 4.710E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.286582E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.04 | backward: 1181.82 | backward-backward: 1181.78 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 2.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step44500 + samples/sec: 652.848 | iteration 48000/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 4.685E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.285410E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.94 | backward: 1181.81 | backward-backward: 1181.78 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 2.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step45000 + samples/sec: 652.861 | iteration 48500/ 143000 | elapsed time per iteration (ms): 1568.5 | learning rate: 4.659E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.284136E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.94 | backward: 1181.72 | backward-backward: 1181.68 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 2.43 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step45500 + samples/sec: 652.566 | iteration 49000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 4.633E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.284063E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.71 | backward: 1182.01 | backward-backward: 1181.97 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 2.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step46000 + samples/sec: 652.951 | iteration 49500/ 143000 | elapsed time per iteration (ms): 1568.3 | learning rate: 4.607E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.283679E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.46 | backward: 1182.09 | backward-backward: 1182.05 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 1.99 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step46500 + samples/sec: 652.813 | iteration 50000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 4.581E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.280456E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.34 | backward: 1181.31 | backward-backward: 1181.27 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 2.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step47000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 50000 | lm_loss value: 2.234592E+00 | lm_loss_ppl value: 9.342673E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 623.068 | iteration 50500/ 143000 | elapsed time per iteration (ms): 1643.5 | learning rate: 4.554E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.281314E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.19 | backward: 1184.33 | backward-backward: 1184.29 | backward-allreduce: 0.00 | optimizer: 42.18 | batch generator: 2.94 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step47500 + samples/sec: 652.941 | iteration 51000/ 143000 | elapsed time per iteration (ms): 1568.3 | learning rate: 4.528E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.280354E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1181.16 | backward-backward: 1181.12 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 3.06 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step48000 + samples/sec: 652.145 | iteration 51500/ 143000 | elapsed time per iteration (ms): 1570.2 | learning rate: 4.501E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.278452E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.88 | backward: 1182.66 | backward-backward: 1182.62 | backward-allreduce: 0.00 | optimizer: 41.90 | batch generator: 3.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step48500 + samples/sec: 651.645 | iteration 52000/ 143000 | elapsed time per iteration (ms): 1571.4 | learning rate: 4.474E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.277888E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.89 | backward: 1184.75 | backward-backward: 1184.71 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step49000 + samples/sec: 652.678 | iteration 52500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 4.447E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.279532E+00 | loss scale: 8192.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 340.82 | backward: 1182.24 | backward-backward: 1182.20 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 2.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step49500 + samples/sec: 651.823 | iteration 53000/ 143000 | elapsed time per iteration (ms): 1571.0 | learning rate: 4.420E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.276786E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.69 | backward: 1184.61 | backward-backward: 1184.58 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 2.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step50000 + samples/sec: 652.710 | iteration 53500/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 4.393E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.276312E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.96 | backward: 1182.02 | backward-backward: 1181.99 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step50500 + samples/sec: 652.723 | iteration 54000/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 4.365E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.274856E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 340.75 | backward: 1182.02 | backward-backward: 1181.99 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 2.32 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step51000 + samples/sec: 648.954 | iteration 54500/ 143000 | elapsed time per iteration (ms): 1577.9 | learning rate: 4.337E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.274580E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 344.31 | backward: 1187.67 | backward-backward: 1187.63 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 5.86 + samples/sec: 648.675 | iteration 55000/ 143000 | elapsed time per iteration (ms): 1578.6 | learning rate: 4.310E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.273052E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +after 55000 iterations memory (MB) | allocated: 1832.7998046875 | max allocated: 12938.84716796875 | reserved: 14140.0 | max reserved: 14140.0 +time (ms) | forward: 346.33 | backward: 1189.26 | backward-backward: 1189.22 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 6.13 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step52000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 55000 | lm_loss value: 2.228575E+00 | lm_loss_ppl value: 9.286619E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 617.417 | iteration 55500/ 143000 | elapsed time per iteration (ms): 1658.5 | learning rate: 4.282E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.273353E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.60 | backward: 1196.57 | backward-backward: 1196.53 | backward-allreduce: 0.00 | optimizer: 42.25 | batch generator: 4.91 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step52500 + samples/sec: 651.216 | iteration 56000/ 143000 | elapsed time per iteration (ms): 1572.4 | learning rate: 4.254E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.271231E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.53 | backward: 1184.23 | backward-backward: 1184.19 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 4.47 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step53000 + samples/sec: 652.624 | iteration 56500/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 4.226E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.270630E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.79 | backward: 1180.22 | backward-backward: 1180.19 | backward-allreduce: 0.00 | optimizer: 42.71 | batch generator: 4.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step53500 + samples/sec: 649.537 | iteration 57000/ 143000 | elapsed time per iteration (ms): 1576.5 | learning rate: 4.198E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.269735E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.00 | backward: 1187.93 | backward-backward: 1187.89 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 4.68 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step54000 + samples/sec: 649.505 | iteration 57500/ 143000 | elapsed time per iteration (ms): 1576.6 | learning rate: 4.170E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.269509E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.72 | backward: 1188.22 | backward-backward: 1188.18 | backward-allreduce: 0.00 | optimizer: 42.04 | batch generator: 4.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step54500 + samples/sec: 649.392 | iteration 58000/ 143000 | elapsed time per iteration (ms): 1576.9 | learning rate: 4.141E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.269973E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.76 | backward: 1188.40 | backward-backward: 1188.37 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 4.78 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step55000 + samples/sec: 649.268 | iteration 58500/ 143000 | elapsed time per iteration (ms): 1577.2 | learning rate: 4.113E-04 | approx flops per GPU: 73.3TFLOPS | lm_loss: 2.265485E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.87 | backward: 1188.15 | backward-backward: 1188.12 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 4.96 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step55500 + samples/sec: 648.017 | iteration 59000/ 143000 | elapsed time per iteration (ms): 1580.2 | learning rate: 4.084E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.267922E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.19 | backward: 1191.30 | backward-backward: 1191.27 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 4.96 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step56000 + samples/sec: 651.310 | iteration 59500/ 143000 | elapsed time per iteration (ms): 1572.2 | learning rate: 4.055E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.266027E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.00 | backward: 1183.41 | backward-backward: 1183.38 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 4.88 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step56500 + samples/sec: 649.366 | iteration 60000/ 143000 | elapsed time per iteration (ms): 1576.9 | learning rate: 4.027E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.266302E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.90 | backward: 1188.29 | backward-backward: 1188.25 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 4.94 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step57000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 60000 | lm_loss value: 2.218708E+00 | lm_loss_ppl value: 9.195445E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 619.697 | iteration 60500/ 143000 | elapsed time per iteration (ms): 1652.4 | learning rate: 3.998E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.264030E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.16 | backward: 1191.71 | backward-backward: 1191.67 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 5.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step57500 + samples/sec: 647.922 | iteration 61000/ 143000 | elapsed time per iteration (ms): 1580.4 | learning rate: 3.969E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.263379E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.47 | backward: 1192.25 | backward-backward: 1192.22 | backward-allreduce: 0.00 | optimizer: 42.28 | batch generator: 4.27 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step58000 + samples/sec: 642.652 | iteration 61500/ 143000 | elapsed time per iteration (ms): 1593.4 | learning rate: 3.940E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 2.264948E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.86 | backward: 1204.51 | backward-backward: 1204.48 | backward-allreduce: 0.00 | optimizer: 42.50 | batch generator: 4.92 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step58500 + samples/sec: 641.418 | iteration 62000/ 143000 | elapsed time per iteration (ms): 1596.5 | learning rate: 3.911E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 2.259878E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.83 | backward: 1207.76 | backward-backward: 1207.72 | backward-allreduce: 0.00 | optimizer: 42.56 | batch generator: 4.87 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step59000 + samples/sec: 638.187 | iteration 62500/ 143000 | elapsed time per iteration (ms): 1604.5 | learning rate: 3.881E-04 | approx flops per GPU: 72.1TFLOPS | lm_loss: 2.261517E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.86 | backward: 1215.75 | backward-backward: 1215.72 | backward-allreduce: 0.00 | optimizer: 42.38 | batch generator: 4.87 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step59500 + samples/sec: 646.549 | iteration 63000/ 143000 | elapsed time per iteration (ms): 1583.8 | learning rate: 3.852E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.257204E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.09 | backward: 1195.31 | backward-backward: 1195.28 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 4.85 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step60000 + samples/sec: 644.568 | iteration 63500/ 143000 | elapsed time per iteration (ms): 1588.7 | learning rate: 3.823E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.258590E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.23 | backward: 1199.66 | backward-backward: 1199.62 | backward-allreduce: 0.00 | optimizer: 42.32 | batch generator: 4.97 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step60500 + samples/sec: 643.947 | iteration 64000/ 143000 | elapsed time per iteration (ms): 1590.2 | learning rate: 3.793E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 2.260260E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.31 | backward: 1202.57 | backward-backward: 1202.53 | backward-allreduce: 0.00 | optimizer: 42.52 | batch generator: 3.30 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step61000 + samples/sec: 644.429 | iteration 64500/ 143000 | elapsed time per iteration (ms): 1589.0 | learning rate: 3.764E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.256570E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.32 | backward: 1201.68 | backward-backward: 1201.65 | backward-allreduce: 0.00 | optimizer: 42.52 | batch generator: 3.52 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step61500 + samples/sec: 646.109 | iteration 65000/ 143000 | elapsed time per iteration (ms): 1584.9 | learning rate: 3.735E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.258062E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.02 | backward: 1198.31 | backward-backward: 1198.28 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 3.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step62000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 65000 | lm_loss value: 2.208251E+00 | lm_loss_ppl value: 9.099783E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 616.290 | iteration 65500/ 143000 | elapsed time per iteration (ms): 1661.6 | learning rate: 3.705E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.256114E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.38 | backward: 1202.28 | backward-backward: 1202.25 | backward-allreduce: 0.00 | optimizer: 42.59 | batch generator: 4.19 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step62500 + samples/sec: 644.474 | iteration 66000/ 143000 | elapsed time per iteration (ms): 1588.9 | learning rate: 3.675E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.254815E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1201.44 | backward-backward: 1201.40 | backward-allreduce: 0.00 | optimizer: 42.38 | batch generator: 3.47 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step63000 + samples/sec: 641.372 | iteration 66500/ 143000 | elapsed time per iteration (ms): 1596.6 | learning rate: 3.646E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 2.256632E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.36 | backward: 1209.20 | backward-backward: 1209.16 | backward-allreduce: 0.00 | optimizer: 42.33 | batch generator: 3.31 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step63500 + samples/sec: 644.680 | iteration 67000/ 143000 | elapsed time per iteration (ms): 1588.4 | learning rate: 3.616E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.257665E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.75 | backward: 1200.66 | backward-backward: 1200.62 | backward-allreduce: 0.00 | optimizer: 42.36 | batch generator: 3.83 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step64000 + samples/sec: 639.322 | iteration 67500/ 143000 | elapsed time per iteration (ms): 1601.7 | learning rate: 3.586E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 2.254628E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.72 | backward: 1213.66 | backward-backward: 1213.62 | backward-allreduce: 0.00 | optimizer: 42.74 | batch generator: 3.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step64500 + samples/sec: 641.877 | iteration 68000/ 143000 | elapsed time per iteration (ms): 1595.3 | learning rate: 3.556E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 2.250226E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.75 | backward: 1207.79 | backward-backward: 1207.76 | backward-allreduce: 0.00 | optimizer: 42.29 | batch generator: 3.71 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step65000 + samples/sec: 640.057 | iteration 68500/ 143000 | elapsed time per iteration (ms): 1599.9 | learning rate: 3.527E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 2.251187E+00 | loss scale: 16384.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.81 | backward: 1211.49 | backward-backward: 1211.46 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 4.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step65500 + samples/sec: 644.481 | iteration 69000/ 143000 | elapsed time per iteration (ms): 1588.9 | learning rate: 3.497E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.249857E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.77 | backward: 1199.98 | backward-backward: 1199.94 | backward-allreduce: 0.00 | optimizer: 42.59 | batch generator: 4.82 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step66000 + samples/sec: 646.419 | iteration 69500/ 143000 | elapsed time per iteration (ms): 1584.1 | learning rate: 3.467E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.250643E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.06 | backward: 1195.37 | backward-backward: 1195.33 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 5.05 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step66500 + samples/sec: 641.353 | iteration 70000/ 143000 | elapsed time per iteration (ms): 1596.6 | learning rate: 3.437E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 2.251814E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.28 | backward: 1207.27 | backward-backward: 1207.24 | backward-allreduce: 0.00 | optimizer: 42.54 | batch generator: 5.10 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step67000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 70000 | lm_loss value: 2.202823E+00 | lm_loss_ppl value: 9.050529E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 615.114 | iteration 70500/ 143000 | elapsed time per iteration (ms): 1664.7 | learning rate: 3.407E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.250292E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.27 | backward: 1199.94 | backward-backward: 1199.90 | backward-allreduce: 0.00 | optimizer: 42.27 | batch generator: 5.78 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step67500 + samples/sec: 648.211 | iteration 71000/ 143000 | elapsed time per iteration (ms): 1579.7 | learning rate: 3.377E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.248968E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.28 | backward: 1191.11 | backward-backward: 1191.07 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 5.36 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step68000 + samples/sec: 643.127 | iteration 71500/ 143000 | elapsed time per iteration (ms): 1592.2 | learning rate: 3.347E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 2.247543E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.89 | backward: 1203.26 | backward-backward: 1203.22 | backward-allreduce: 0.00 | optimizer: 42.50 | batch generator: 4.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step68500 + samples/sec: 646.239 | iteration 72000/ 143000 | elapsed time per iteration (ms): 1584.6 | learning rate: 3.317E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.245795E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.36 | backward: 1196.22 | backward-backward: 1196.18 | backward-allreduce: 0.00 | optimizer: 42.29 | batch generator: 4.34 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step69000 + samples/sec: 643.406 | iteration 72500/ 143000 | elapsed time per iteration (ms): 1591.5 | learning rate: 3.288E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 2.243905E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.16 | backward: 1203.86 | backward-backward: 1203.83 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 4.05 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step69500 + samples/sec: 646.180 | iteration 73000/ 143000 | elapsed time per iteration (ms): 1584.7 | learning rate: 3.258E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.245046E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.96 | backward: 1195.91 | backward-backward: 1195.87 | backward-allreduce: 0.00 | optimizer: 42.39 | batch generator: 4.75 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step70000 + samples/sec: 648.045 | iteration 73500/ 143000 | elapsed time per iteration (ms): 1580.1 | learning rate: 3.228E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.245450E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.27 | backward: 1192.51 | backward-backward: 1192.48 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 4.22 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step70500 + samples/sec: 651.404 | iteration 74000/ 143000 | elapsed time per iteration (ms): 1572.0 | learning rate: 3.198E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.243675E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.64 | backward: 1184.76 | backward-backward: 1184.72 | backward-allreduce: 0.00 | optimizer: 41.90 | batch generator: 3.50 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step71000 + samples/sec: 647.788 | iteration 74500/ 143000 | elapsed time per iteration (ms): 1580.8 | learning rate: 3.168E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.242719E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.88 | backward: 1192.34 | backward-backward: 1192.30 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 4.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step71500 + samples/sec: 644.285 | iteration 75000/ 143000 | elapsed time per iteration (ms): 1589.4 | learning rate: 3.138E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.241913E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.56 | backward: 1201.91 | backward-backward: 1201.87 | backward-allreduce: 0.00 | optimizer: 42.42 | batch generator: 3.43 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step72000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 75000 | lm_loss value: 2.191936E+00 | lm_loss_ppl value: 8.952531E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 616.742 | iteration 75500/ 143000 | elapsed time per iteration (ms): 1660.3 | learning rate: 3.108E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.239421E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.06 | backward: 1200.56 | backward-backward: 1200.53 | backward-allreduce: 0.00 | optimizer: 42.36 | batch generator: 4.50 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step72500 + samples/sec: 644.900 | iteration 76000/ 143000 | elapsed time per iteration (ms): 1587.8 | learning rate: 3.078E-04 | approx flops per GPU: 72.9TFLOPS | lm_loss: 2.240591E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.70 | backward: 1199.34 | backward-backward: 1199.30 | backward-allreduce: 0.00 | optimizer: 42.35 | batch generator: 4.76 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step73000 + samples/sec: 644.743 | iteration 76500/ 143000 | elapsed time per iteration (ms): 1588.2 | learning rate: 3.048E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.240200E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.16 | backward: 1199.39 | backward-backward: 1199.36 | backward-allreduce: 0.00 | optimizer: 42.25 | batch generator: 5.21 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step73500 + samples/sec: 645.982 | iteration 77000/ 143000 | elapsed time per iteration (ms): 1585.2 | learning rate: 3.019E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.239637E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.67 | backward: 1195.47 | backward-backward: 1195.43 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 4.59 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step74000 + samples/sec: 648.338 | iteration 77500/ 143000 | elapsed time per iteration (ms): 1579.4 | learning rate: 2.989E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.237583E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.55 | backward: 1191.46 | backward-backward: 1191.42 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 4.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step74500 + samples/sec: 644.505 | iteration 78000/ 143000 | elapsed time per iteration (ms): 1588.8 | learning rate: 2.959E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.236106E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.12 | backward: 1201.96 | backward-backward: 1201.92 | backward-allreduce: 0.00 | optimizer: 42.27 | batch generator: 3.12 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step75000 + samples/sec: 651.607 | iteration 78500/ 143000 | elapsed time per iteration (ms): 1571.5 | learning rate: 2.930E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.235385E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.65 | backward: 1184.66 | backward-backward: 1184.62 | backward-allreduce: 0.00 | optimizer: 41.90 | batch generator: 3.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step75500 + samples/sec: 649.718 | iteration 79000/ 143000 | elapsed time per iteration (ms): 1576.1 | learning rate: 2.900E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.234992E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.87 | backward: 1188.62 | backward-backward: 1188.58 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 3.71 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step76000 + samples/sec: 649.820 | iteration 79500/ 143000 | elapsed time per iteration (ms): 1575.8 | learning rate: 2.870E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.235844E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.88 | backward: 1188.45 | backward-backward: 1188.41 | backward-allreduce: 0.00 | optimizer: 41.92 | batch generator: 3.78 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step76500 + samples/sec: 649.830 | iteration 80000/ 143000 | elapsed time per iteration (ms): 1575.8 | learning rate: 2.841E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.234097E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.18 | backward: 1188.19 | backward-backward: 1188.15 | backward-allreduce: 0.00 | optimizer: 41.90 | batch generator: 4.23 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step77000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 80000 | lm_loss value: 2.187298E+00 | lm_loss_ppl value: 8.911103E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 619.827 | iteration 80500/ 143000 | elapsed time per iteration (ms): 1652.1 | learning rate: 2.811E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.234598E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.94 | backward: 1192.60 | backward-backward: 1192.56 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 4.45 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step77500 + samples/sec: 651.485 | iteration 81000/ 143000 | elapsed time per iteration (ms): 1571.8 | learning rate: 2.782E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.232820E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.45 | backward: 1185.02 | backward-backward: 1184.98 | backward-allreduce: 0.00 | optimizer: 41.90 | batch generator: 3.46 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step78000 + samples/sec: 647.741 | iteration 81500/ 143000 | elapsed time per iteration (ms): 1580.9 | learning rate: 2.753E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.233005E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.24 | backward: 1192.73 | backward-backward: 1192.69 | backward-allreduce: 0.00 | optimizer: 42.40 | batch generator: 4.01 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step78500 + samples/sec: 649.471 | iteration 82000/ 143000 | elapsed time per iteration (ms): 1576.7 | learning rate: 2.723E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.231943E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1189.15 | backward-backward: 1189.12 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 3.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step79000 + samples/sec: 649.789 | iteration 82500/ 143000 | elapsed time per iteration (ms): 1575.9 | learning rate: 2.694E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.229631E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.43 | backward: 1188.08 | backward-backward: 1188.04 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 4.37 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step79500 + samples/sec: 649.616 | iteration 83000/ 143000 | elapsed time per iteration (ms): 1576.3 | learning rate: 2.665E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.229019E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.49 | backward: 1188.41 | backward-backward: 1188.38 | backward-allreduce: 0.00 | optimizer: 42.15 | batch generator: 4.58 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step80000 + samples/sec: 647.644 | iteration 83500/ 143000 | elapsed time per iteration (ms): 1581.1 | learning rate: 2.636E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.228788E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 343.46 | backward: 1192.08 | backward-backward: 1192.04 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 5.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step80500 + samples/sec: 642.702 | iteration 84000/ 143000 | elapsed time per iteration (ms): 1593.3 | learning rate: 2.607E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 2.228270E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.32 | backward: 1204.03 | backward-backward: 1203.99 | backward-allreduce: 0.00 | optimizer: 42.42 | batch generator: 5.27 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step81000 + samples/sec: 642.076 | iteration 84500/ 143000 | elapsed time per iteration (ms): 1594.8 | learning rate: 2.578E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 2.226328E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.56 | backward: 1195.68 | backward-backward: 1195.64 | backward-allreduce: 0.00 | optimizer: 41.95 | batch generator: 5.40 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step81500 + samples/sec: 651.305 | iteration 85000/ 143000 | elapsed time per iteration (ms): 1572.2 | learning rate: 2.550E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.226146E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.80 | backward: 1183.75 | backward-backward: 1183.72 | backward-allreduce: 0.00 | optimizer: 41.93 | batch generator: 4.74 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step82000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 85000 | lm_loss value: 2.177086E+00 | lm_loss_ppl value: 8.820567E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 618.229 | iteration 85500/ 143000 | elapsed time per iteration (ms): 1656.3 | learning rate: 2.521E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.224127E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.24 | backward: 1197.28 | backward-backward: 1197.24 | backward-allreduce: 0.00 | optimizer: 42.31 | batch generator: 3.76 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step82500 + samples/sec: 646.475 | iteration 86000/ 143000 | elapsed time per iteration (ms): 1584.0 | learning rate: 2.492E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.223833E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.41 | backward: 1196.70 | backward-backward: 1196.66 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 3.22 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step83000 + samples/sec: 646.175 | iteration 86500/ 143000 | elapsed time per iteration (ms): 1584.7 | learning rate: 2.464E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.225564E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.21 | backward: 1196.51 | backward-backward: 1196.48 | backward-allreduce: 0.00 | optimizer: 42.27 | batch generator: 4.12 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step83500 + samples/sec: 646.403 | iteration 87000/ 143000 | elapsed time per iteration (ms): 1584.2 | learning rate: 2.435E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.223049E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.23 | backward: 1195.31 | backward-backward: 1195.27 | backward-allreduce: 0.00 | optimizer: 42.06 | batch generator: 4.96 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step84000 + samples/sec: 644.665 | iteration 87500/ 143000 | elapsed time per iteration (ms): 1588.4 | learning rate: 2.407E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.221034E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.46 | backward: 1199.54 | backward-backward: 1199.50 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 5.27 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step84500 + samples/sec: 649.404 | iteration 88000/ 143000 | elapsed time per iteration (ms): 1576.8 | learning rate: 2.379E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.222323E+00 | loss scale: 8192.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.44 | backward: 1187.87 | backward-backward: 1187.84 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 5.19 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step85000 + samples/sec: 646.053 | iteration 88500/ 143000 | elapsed time per iteration (ms): 1585.0 | learning rate: 2.351E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.220698E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.54 | backward: 1196.62 | backward-backward: 1196.58 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 4.42 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step85500 + samples/sec: 646.035 | iteration 89000/ 143000 | elapsed time per iteration (ms): 1585.1 | learning rate: 2.323E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.220872E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.80 | backward: 1196.31 | backward-backward: 1196.27 | backward-allreduce: 0.00 | optimizer: 42.39 | batch generator: 4.74 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step86000 + samples/sec: 644.533 | iteration 89500/ 143000 | elapsed time per iteration (ms): 1588.7 | learning rate: 2.295E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.218424E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.94 | backward: 1199.67 | backward-backward: 1199.63 | backward-allreduce: 0.00 | optimizer: 42.52 | batch generator: 4.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step86500 + samples/sec: 644.513 | iteration 90000/ 143000 | elapsed time per iteration (ms): 1588.8 | learning rate: 2.267E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.218250E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.64 | backward: 1201.06 | backward-backward: 1201.03 | backward-allreduce: 0.00 | optimizer: 42.66 | batch generator: 3.55 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step87000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 90000 | lm_loss value: 2.170986E+00 | lm_loss_ppl value: 8.766921E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 617.930 | iteration 90500/ 143000 | elapsed time per iteration (ms): 1657.1 | learning rate: 2.240E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.217364E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.69 | backward: 1196.65 | backward-backward: 1196.62 | backward-allreduce: 0.00 | optimizer: 42.29 | batch generator: 4.12 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step87500 + samples/sec: 649.596 | iteration 91000/ 143000 | elapsed time per iteration (ms): 1576.4 | learning rate: 2.212E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.215816E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.09 | backward: 1188.89 | backward-backward: 1188.85 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 3.99 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step88000 + samples/sec: 649.716 | iteration 91500/ 143000 | elapsed time per iteration (ms): 1576.1 | learning rate: 2.185E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.216458E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.26 | backward: 1188.43 | backward-backward: 1188.39 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 4.22 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step88500 + samples/sec: 644.739 | iteration 92000/ 143000 | elapsed time per iteration (ms): 1588.2 | learning rate: 2.158E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.215258E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.63 | backward: 1199.91 | backward-backward: 1199.88 | backward-allreduce: 0.00 | optimizer: 42.18 | batch generator: 4.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step89000 + samples/sec: 646.241 | iteration 92500/ 143000 | elapsed time per iteration (ms): 1584.5 | learning rate: 2.131E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.214200E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 342.15 | backward: 1196.58 | backward-backward: 1196.54 | backward-allreduce: 0.00 | optimizer: 42.26 | batch generator: 4.11 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step89500 + samples/sec: 649.729 | iteration 93000/ 143000 | elapsed time per iteration (ms): 1576.0 | learning rate: 2.104E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.215164E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.55 | backward: 1187.88 | backward-backward: 1187.84 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 4.32 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step90000 + samples/sec: 647.866 | iteration 93500/ 143000 | elapsed time per iteration (ms): 1580.6 | learning rate: 2.077E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.211570E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.54 | backward: 1191.68 | backward-backward: 1191.64 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 5.35 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step90500 + samples/sec: 645.250 | iteration 94000/ 143000 | elapsed time per iteration (ms): 1587.0 | learning rate: 2.050E-04 | approx flops per GPU: 72.9TFLOPS | lm_loss: 2.211672E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.13 | backward: 1200.18 | backward-backward: 1200.14 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.95 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step91000 + samples/sec: 647.892 | iteration 94500/ 143000 | elapsed time per iteration (ms): 1580.5 | learning rate: 2.024E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.212595E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.41 | backward: 1192.39 | backward-backward: 1192.35 | backward-allreduce: 0.00 | optimizer: 42.28 | batch generator: 4.28 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step91500 + samples/sec: 652.903 | iteration 95000/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 1.998E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.209821E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.53 | backward: 1179.45 | backward-backward: 1179.41 | backward-allreduce: 0.00 | optimizer: 41.82 | batch generator: 5.29 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step92000 +---------------------------------------------------------------------------------------------------------- + validation results at iteration 95000 | lm_loss value: 2.164396E+00 | lm_loss_ppl value: 8.709342E+00 | +---------------------------------------------------------------------------------------------------------- + samples/sec: 619.976 | iteration 95500/ 143000 | elapsed time per iteration (ms): 1651.7 | learning rate: 1.972E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.210356E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 343.67 | backward: 1190.92 | backward-backward: 1190.88 | backward-allreduce: 0.00 | optimizer: 41.69 | batch generator: 6.07 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step92500 + samples/sec: 649.380 | iteration 96000/ 143000 | elapsed time per iteration (ms): 1576.9 | learning rate: 1.946E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.210721E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.67 | backward: 1188.43 | backward-backward: 1188.39 | backward-allreduce: 0.00 | optimizer: 42.17 | batch generator: 4.46 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step93000 + samples/sec: 648.232 | iteration 96500/ 143000 | elapsed time per iteration (ms): 1579.7 | learning rate: 1.920E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.208288E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.77 | backward: 1188.14 | backward-backward: 1188.10 | backward-allreduce: 0.00 | optimizer: 41.91 | batch generator: 4.66 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step93500 + samples/sec: 649.838 | iteration 97000/ 143000 | elapsed time per iteration (ms): 1575.8 | learning rate: 1.894E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.207250E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.73 | backward: 1187.68 | backward-backward: 1187.64 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 4.72 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step94000 + samples/sec: 651.463 | iteration 97500/ 143000 | elapsed time per iteration (ms): 1571.8 | learning rate: 1.869E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.207198E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.33 | backward: 1184.14 | backward-backward: 1184.11 | backward-allreduce: 0.00 | optimizer: 41.81 | batch generator: 4.29 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step94500 + samples/sec: 644.605 | iteration 98000/ 143000 | elapsed time per iteration (ms): 1588.6 | learning rate: 1.843E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 2.205237E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.47 | backward: 1200.13 | backward-backward: 1200.10 | backward-allreduce: 0.00 | optimizer: 42.53 | batch generator: 4.47 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step95000 + samples/sec: 653.092 | iteration 98500/ 143000 | elapsed time per iteration (ms): 1567.9 | learning rate: 1.818E-04 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.204356E+00 | loss scale: 16384.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 343.32 | backward: 1179.29 | backward-backward: 1179.26 | backward-allreduce: 0.00 | optimizer: 41.64 | batch generator: 5.09 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step95500 + samples/sec: 651.298 | iteration 99000/ 143000 | elapsed time per iteration (ms): 1572.2 | learning rate: 1.793E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.204523E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.54 | backward: 1185.00 | backward-backward: 1184.96 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 3.38 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step96000 + samples/sec: 646.389 | iteration 99500/ 143000 | elapsed time per iteration (ms): 1584.2 | learning rate: 1.769E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.205386E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.30 | backward: 1196.08 | backward-backward: 1196.04 | backward-allreduce: 0.00 | optimizer: 42.27 | batch generator: 4.19 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step96500 + samples/sec: 646.346 | iteration 100000/ 143000 | elapsed time per iteration (ms): 1584.3 | learning rate: 1.744E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.203180E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.74 | backward: 1195.74 | backward-backward: 1195.70 | backward-allreduce: 0.00 | optimizer: 42.26 | batch generator: 4.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step97000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 100000 | lm_loss value: 2.155080E+00 | lm_loss_ppl value: 8.628577E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 619.854 | iteration 100500/ 143000 | elapsed time per iteration (ms): 1652.0 | learning rate: 1.720E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.201184E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.99 | backward: 1191.65 | backward-backward: 1191.61 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 5.64 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step97500 + samples/sec: 651.373 | iteration 101000/ 143000 | elapsed time per iteration (ms): 1572.1 | learning rate: 1.696E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.200471E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.04 | backward: 1183.79 | backward-backward: 1183.75 | backward-allreduce: 0.00 | optimizer: 41.72 | batch generator: 5.01 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step98000 + samples/sec: 646.291 | iteration 101500/ 143000 | elapsed time per iteration (ms): 1584.4 | learning rate: 1.672E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.200939E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.82 | backward: 1195.98 | backward-backward: 1195.94 | backward-allreduce: 0.00 | optimizer: 42.26 | batch generator: 4.68 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step98500 + samples/sec: 649.578 | iteration 102000/ 143000 | elapsed time per iteration (ms): 1576.4 | learning rate: 1.648E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.199260E+00 | loss scale: 8192.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.79 | backward: 1188.89 | backward-backward: 1188.85 | backward-allreduce: 0.00 | optimizer: 42.13 | batch generator: 3.79 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step99000 + samples/sec: 647.864 | iteration 102500/ 143000 | elapsed time per iteration (ms): 1580.6 | learning rate: 1.624E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.199196E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.50 | backward: 1191.42 | backward-backward: 1191.39 | backward-allreduce: 0.00 | optimizer: 42.11 | batch generator: 5.46 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step99500 + samples/sec: 647.892 | iteration 103000/ 143000 | elapsed time per iteration (ms): 1580.5 | learning rate: 1.601E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.198333E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.84 | backward: 1191.93 | backward-backward: 1191.89 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 4.70 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step100000 + samples/sec: 649.669 | iteration 103500/ 143000 | elapsed time per iteration (ms): 1576.2 | learning rate: 1.578E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.197894E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.59 | backward: 1187.44 | backward-backward: 1187.40 | backward-allreduce: 0.00 | optimizer: 41.93 | batch generator: 5.44 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step100500 + samples/sec: 651.215 | iteration 104000/ 143000 | elapsed time per iteration (ms): 1572.4 | learning rate: 1.555E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.197459E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.89 | backward: 1183.87 | backward-backward: 1183.83 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 4.79 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step101000 + samples/sec: 648.124 | iteration 104500/ 143000 | elapsed time per iteration (ms): 1579.9 | learning rate: 1.532E-04 | approx flops per GPU: 73.2TFLOPS | lm_loss: 2.196155E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.55 | backward: 1191.91 | backward-backward: 1191.87 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 4.43 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step101500 + samples/sec: 651.433 | iteration 105000/ 143000 | elapsed time per iteration (ms): 1571.9 | learning rate: 1.509E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.196881E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.10 | backward: 1183.45 | backward-backward: 1183.42 | backward-allreduce: 0.00 | optimizer: 41.76 | batch generator: 5.07 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step102000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 105000 | lm_loss value: 2.150724E+00 | lm_loss_ppl value: 8.591076E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 615.068 | iteration 105500/ 143000 | elapsed time per iteration (ms): 1664.9 | learning rate: 1.487E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.195792E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.57 | backward: 1203.61 | backward-backward: 1203.57 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 6.04 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step102500 + samples/sec: 649.434 | iteration 106000/ 143000 | elapsed time per iteration (ms): 1576.8 | learning rate: 1.465E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.193735E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.88 | backward: 1187.45 | backward-backward: 1187.42 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 5.66 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step103000 + samples/sec: 646.212 | iteration 106500/ 143000 | elapsed time per iteration (ms): 1584.6 | learning rate: 1.443E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.193853E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.43 | backward: 1195.44 | backward-backward: 1195.41 | backward-allreduce: 0.00 | optimizer: 42.06 | batch generator: 5.08 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step103500 + samples/sec: 646.428 | iteration 107000/ 143000 | elapsed time per iteration (ms): 1584.1 | learning rate: 1.422E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.194042E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 343.06 | backward: 1195.48 | backward-backward: 1195.44 | backward-allreduce: 0.00 | optimizer: 42.25 | batch generator: 5.06 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step104000 + samples/sec: 646.513 | iteration 107500/ 143000 | elapsed time per iteration (ms): 1583.9 | learning rate: 1.400E-04 | approx flops per GPU: 73.0TFLOPS | lm_loss: 2.192559E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.85 | backward: 1195.50 | backward-backward: 1195.46 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 4.81 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step104500 + samples/sec: 651.209 | iteration 108000/ 143000 | elapsed time per iteration (ms): 1572.5 | learning rate: 1.379E-04 | approx flops per GPU: 73.6TFLOPS | lm_loss: 2.190536E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.21 | backward: 1183.96 | backward-backward: 1183.93 | backward-allreduce: 0.00 | optimizer: 41.85 | batch generator: 4.97 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step105000 + samples/sec: 644.931 | iteration 108500/ 143000 | elapsed time per iteration (ms): 1587.8 | learning rate: 1.358E-04 | approx flops per GPU: 72.9TFLOPS | lm_loss: 2.190384E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 343.53 | backward: 1198.69 | backward-backward: 1198.65 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 5.48 + samples/sec: 650.144 | iteration 109000/ 143000 | elapsed time per iteration (ms): 1575.0 | learning rate: 1.338E-04 | approx flops per GPU: 73.4TFLOPS | lm_loss: 2.189802E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +after 109000 iterations memory (MB) | allocated: 1832.7998046875 | max allocated: 12938.84716796875 | reserved: 14140.0 | max reserved: 14140.0 +time (ms) | forward: 348.74 | backward: 1183.25 | backward-backward: 1183.21 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 7.66 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step106000 + samples/sec: 652.217 | iteration 109500/ 143000 | elapsed time per iteration (ms): 1570.0 | learning rate: 1.317E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.188999E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.08 | backward: 1181.45 | backward-backward: 1181.42 | backward-allreduce: 0.00 | optimizer: 41.84 | batch generator: 3.20 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step106500 + samples/sec: 652.688 | iteration 110000/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 1.297E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.191502E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.91 | backward: 1181.58 | backward-backward: 1181.55 | backward-allreduce: 0.00 | optimizer: 41.95 | batch generator: 3.14 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step107000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 110000 | lm_loss value: 2.142145E+00 | lm_loss_ppl value: 8.517688E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.705 | iteration 110500/ 143000 | elapsed time per iteration (ms): 1641.8 | learning rate: 1.277E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.188919E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.83 | backward: 1181.75 | backward-backward: 1181.71 | backward-allreduce: 0.00 | optimizer: 42.68 | batch generator: 3.60 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step107500 + samples/sec: 652.605 | iteration 111000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 1.257E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.188124E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.33 | backward: 1181.47 | backward-backward: 1181.43 | backward-allreduce: 0.00 | optimizer: 41.76 | batch generator: 3.34 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step108000 + samples/sec: 652.408 | iteration 111500/ 143000 | elapsed time per iteration (ms): 1569.6 | learning rate: 1.238E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.186715E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.38 | backward: 1181.36 | backward-backward: 1181.32 | backward-allreduce: 0.00 | optimizer: 41.94 | batch generator: 3.37 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step108500 + samples/sec: 652.542 | iteration 112000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 1.219E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.183673E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.73 | backward: 1181.80 | backward-backward: 1181.77 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 2.95 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step109000 + samples/sec: 652.696 | iteration 112500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 1.200E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.185006E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.01 | backward: 1181.62 | backward-backward: 1181.59 | backward-allreduce: 0.00 | optimizer: 41.82 | batch generator: 3.36 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step109500 + samples/sec: 650.842 | iteration 113000/ 143000 | elapsed time per iteration (ms): 1573.3 | learning rate: 1.181E-04 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.182081E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 342.09 | backward: 1185.47 | backward-backward: 1185.44 | backward-allreduce: 0.00 | optimizer: 41.98 | batch generator: 3.26 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step110000 + samples/sec: 652.625 | iteration 113500/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 1.163E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.184227E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 342.07 | backward: 1181.56 | backward-backward: 1181.52 | backward-allreduce: 0.00 | optimizer: 41.89 | batch generator: 3.38 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step110500 + samples/sec: 650.967 | iteration 114000/ 143000 | elapsed time per iteration (ms): 1573.0 | learning rate: 1.145E-04 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.182455E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.50 | backward: 1186.12 | backward-backward: 1186.09 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 2.89 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step111000 + samples/sec: 652.553 | iteration 114500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 1.127E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.182220E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.84 | backward: 1181.99 | backward-backward: 1181.95 | backward-allreduce: 0.00 | optimizer: 41.87 | batch generator: 3.25 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step111500 + samples/sec: 652.562 | iteration 115000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 1.109E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.181237E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.82 | backward: 1181.91 | backward-backward: 1181.87 | backward-allreduce: 0.00 | optimizer: 41.98 | batch generator: 3.23 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step112000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 115000 | lm_loss value: 2.131743E+00 | lm_loss_ppl value: 8.429548E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.940 | iteration 115500/ 143000 | elapsed time per iteration (ms): 1641.2 | learning rate: 1.092E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.179543E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.93 | backward: 1181.80 | backward-backward: 1181.76 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 3.39 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step112500 + samples/sec: 652.627 | iteration 116000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 1.075E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.180763E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.86 | backward: 1181.75 | backward-backward: 1181.71 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 3.05 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step113000 + samples/sec: 650.606 | iteration 116500/ 143000 | elapsed time per iteration (ms): 1573.9 | learning rate: 1.058E-04 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.180567E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 342.05 | backward: 1181.67 | backward-backward: 1181.63 | backward-allreduce: 0.00 | optimizer: 41.85 | batch generator: 2.93 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step113500 + samples/sec: 652.487 | iteration 117000/ 143000 | elapsed time per iteration (ms): 1569.4 | learning rate: 1.041E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.177936E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.95 | backward: 1181.84 | backward-backward: 1181.80 | backward-allreduce: 0.00 | optimizer: 41.89 | batch generator: 3.17 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step114000 + samples/sec: 652.538 | iteration 117500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 1.025E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.178621E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.76 | backward: 1181.95 | backward-backward: 1181.91 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 2.87 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step114500 + samples/sec: 652.548 | iteration 118000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 1.009E-04 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.177385E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.87 | backward: 1181.69 | backward-backward: 1181.66 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 2.90 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step115000 + samples/sec: 652.516 | iteration 118500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 9.933E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.176641E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.90 | backward: 1181.78 | backward-backward: 1181.74 | backward-allreduce: 0.00 | optimizer: 41.98 | batch generator: 3.17 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step115500 + samples/sec: 652.650 | iteration 119000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 9.779E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.174950E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.69 | backward: 1181.89 | backward-backward: 1181.85 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 3.02 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step116000 + samples/sec: 652.682 | iteration 119500/ 143000 | elapsed time per iteration (ms): 1568.9 | learning rate: 9.628E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.175945E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.61 | backward: 1181.86 | backward-backward: 1181.82 | backward-allreduce: 0.00 | optimizer: 41.79 | batch generator: 2.92 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step116500 + samples/sec: 650.987 | iteration 120000/ 143000 | elapsed time per iteration (ms): 1573.0 | learning rate: 9.480E-05 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.174810E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.60 | backward: 1185.83 | backward-backward: 1185.79 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 2.70 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step117000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 120000 | lm_loss value: 2.124752E+00 | lm_loss_ppl value: 8.370824E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.951 | iteration 120500/ 143000 | elapsed time per iteration (ms): 1641.2 | learning rate: 9.334E-05 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.173496E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.73 | backward: 1181.85 | backward-backward: 1181.81 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 3.21 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step117500 + samples/sec: 652.631 | iteration 121000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 9.191E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.174440E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1181.80 | backward-backward: 1181.77 | backward-allreduce: 0.00 | optimizer: 41.98 | batch generator: 2.75 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step118000 + samples/sec: 652.567 | iteration 121500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 9.051E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.173671E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.59 | backward: 1181.86 | backward-backward: 1181.82 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 2.79 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step118500 + samples/sec: 652.802 | iteration 122000/ 143000 | elapsed time per iteration (ms): 1568.6 | learning rate: 8.914E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.172527E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.43 | backward: 1181.86 | backward-backward: 1181.82 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 2.75 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step119000 + samples/sec: 652.880 | iteration 122500/ 143000 | elapsed time per iteration (ms): 1568.4 | learning rate: 8.781E-05 | approx flops per GPU: 73.8TFLOPS | lm_loss: 2.173356E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.59 | backward: 1181.80 | backward-backward: 1181.76 | backward-allreduce: 0.00 | optimizer: 41.73 | batch generator: 2.92 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step119500 + samples/sec: 650.924 | iteration 123000/ 143000 | elapsed time per iteration (ms): 1573.1 | learning rate: 8.650E-05 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.173805E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.65 | backward: 1185.98 | backward-backward: 1185.94 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 3.04 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step120000 + samples/sec: 652.598 | iteration 123500/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 8.522E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.172041E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.93 | backward: 1181.61 | backward-backward: 1181.58 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 3.07 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step120500 + samples/sec: 652.779 | iteration 124000/ 143000 | elapsed time per iteration (ms): 1568.7 | learning rate: 8.398E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.170999E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.55 | backward: 1181.79 | backward-backward: 1181.75 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 2.81 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step121000 + samples/sec: 652.557 | iteration 124500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 8.276E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.169544E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.34 | backward: 1181.97 | backward-backward: 1181.94 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 2.44 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step121500 + samples/sec: 652.622 | iteration 125000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 8.157E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.170835E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.36 | backward: 1182.07 | backward-backward: 1182.04 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 2.61 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step122000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 125000 | lm_loss value: 2.123132E+00 | lm_loss_ppl value: 8.357269E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.976 | iteration 125500/ 143000 | elapsed time per iteration (ms): 1641.1 | learning rate: 8.041E-05 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.169736E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.44 | backward: 1181.94 | backward-backward: 1181.90 | backward-allreduce: 0.00 | optimizer: 41.93 | batch generator: 3.00 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step122500 + samples/sec: 652.529 | iteration 126000/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 7.929E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.168448E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.76 | backward: 1181.67 | backward-backward: 1181.64 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 2.96 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step123000 + samples/sec: 651.041 | iteration 126500/ 143000 | elapsed time per iteration (ms): 1572.9 | learning rate: 7.820E-05 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.168848E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.38 | backward: 1185.89 | backward-backward: 1185.85 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 2.62 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step123500 + samples/sec: 652.597 | iteration 127000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 7.713E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.169487E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1182.06 | backward-backward: 1182.02 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 2.64 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step124000 + samples/sec: 652.519 | iteration 127500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 7.610E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.167813E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.29 | backward: 1182.03 | backward-backward: 1182.00 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.48 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step124500 + samples/sec: 652.592 | iteration 128000/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 7.509E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.168724E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1181.89 | backward-backward: 1181.85 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 2.67 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step125000 + samples/sec: 652.619 | iteration 128500/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 7.412E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.166499E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.52 | backward: 1181.85 | backward-backward: 1181.82 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 2.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step125500 + samples/sec: 652.748 | iteration 129000/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 7.318E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.167298E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.54 | backward: 1181.81 | backward-backward: 1181.77 | backward-allreduce: 0.00 | optimizer: 41.96 | batch generator: 2.59 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step126000 + samples/sec: 652.727 | iteration 129500/ 143000 | elapsed time per iteration (ms): 1568.8 | learning rate: 7.228E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.164677E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.55 | backward: 1181.72 | backward-backward: 1181.68 | backward-allreduce: 0.00 | optimizer: 42.00 | batch generator: 2.67 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step126500 + samples/sec: 652.562 | iteration 130000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 7.140E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.165758E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.71 | backward: 1181.60 | backward-backward: 1181.57 | backward-allreduce: 0.00 | optimizer: 42.03 | batch generator: 2.69 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step127000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 130000 | lm_loss value: 2.120112E+00 | lm_loss_ppl value: 8.332068E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.969 | iteration 130500/ 143000 | elapsed time per iteration (ms): 1641.1 | learning rate: 7.056E-05 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.165841E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.35 | backward: 1182.08 | backward-backward: 1182.04 | backward-allreduce: 0.00 | optimizer: 41.99 | batch generator: 2.88 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step127500 + samples/sec: 652.551 | iteration 131000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.974E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.165390E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.53 | backward: 1181.98 | backward-backward: 1181.95 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 2.67 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step128000 + samples/sec: 652.469 | iteration 131500/ 143000 | elapsed time per iteration (ms): 1569.4 | learning rate: 6.896E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.163113E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.88 | backward: 1181.70 | backward-backward: 1181.67 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.98 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step128500 + samples/sec: 652.333 | iteration 132000/ 143000 | elapsed time per iteration (ms): 1569.8 | learning rate: 6.822E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.163762E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.46 | backward: 1182.19 | backward-backward: 1182.16 | backward-allreduce: 0.00 | optimizer: 41.78 | batch generator: 2.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step129000 + samples/sec: 650.744 | iteration 132500/ 143000 | elapsed time per iteration (ms): 1573.6 | learning rate: 6.750E-05 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.164282E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.50 | backward: 1186.41 | backward-backward: 1186.37 | backward-allreduce: 0.00 | optimizer: 42.16 | batch generator: 2.55 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step129500 + samples/sec: 652.443 | iteration 133000/ 143000 | elapsed time per iteration (ms): 1569.5 | learning rate: 6.681E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.162564E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.32 | backward: 1182.32 | backward-backward: 1182.28 | backward-allreduce: 0.00 | optimizer: 42.09 | batch generator: 2.31 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step130000 + samples/sec: 652.571 | iteration 133500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.616E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.162058E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.61 | backward: 1181.85 | backward-backward: 1181.81 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.62 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step130500 + samples/sec: 652.431 | iteration 134000/ 143000 | elapsed time per iteration (ms): 1569.5 | learning rate: 6.554E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.161019E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.42 | backward: 1182.03 | backward-backward: 1181.99 | backward-allreduce: 0.00 | optimizer: 42.04 | batch generator: 2.55 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step131000 + samples/sec: 652.599 | iteration 134500/ 143000 | elapsed time per iteration (ms): 1569.1 | learning rate: 6.496E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.161353E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1182.07 | backward-backward: 1182.03 | backward-allreduce: 0.00 | optimizer: 41.78 | batch generator: 2.67 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step131500 + samples/sec: 652.344 | iteration 135000/ 143000 | elapsed time per iteration (ms): 1569.7 | learning rate: 6.440E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.161707E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.77 | backward: 1181.87 | backward-backward: 1181.83 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 2.89 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step132000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 135000 | lm_loss value: 2.115296E+00 | lm_loss_ppl value: 8.292043E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 622.081 | iteration 135500/ 143000 | elapsed time per iteration (ms): 1646.1 | learning rate: 6.388E-05 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.160641E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.81 | backward: 1186.31 | backward-backward: 1186.27 | backward-allreduce: 0.00 | optimizer: 42.18 | batch generator: 3.32 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step132500 + samples/sec: 652.550 | iteration 136000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.339E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.162566E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.47 | backward: 1182.01 | backward-backward: 1181.97 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 2.53 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step133000 + samples/sec: 652.557 | iteration 136500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.293E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.159975E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.47 | backward: 1181.91 | backward-backward: 1181.87 | backward-allreduce: 0.00 | optimizer: 42.14 | batch generator: 2.57 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step133500 + samples/sec: 652.650 | iteration 137000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 6.251E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.161976E+00 | loss scale: 16384.0 | number of skipped iterations: 3 | number of nan iterations: 0 | +time (ms) | forward: 341.46 | backward: 1181.87 | backward-backward: 1181.83 | backward-allreduce: 0.00 | optimizer: 41.87 | batch generator: 2.54 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step134000 + samples/sec: 652.559 | iteration 137500/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.212E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.159670E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.45 | backward: 1182.02 | backward-backward: 1181.99 | backward-allreduce: 0.00 | optimizer: 42.08 | batch generator: 2.44 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step134500 + samples/sec: 652.391 | iteration 138000/ 143000 | elapsed time per iteration (ms): 1569.6 | learning rate: 6.176E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.159072E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.49 | backward: 1182.11 | backward-backward: 1182.07 | backward-allreduce: 0.00 | optimizer: 42.10 | batch generator: 2.56 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step135000 + samples/sec: 652.637 | iteration 138500/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 6.144E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.161305E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.43 | backward: 1181.84 | backward-backward: 1181.80 | backward-allreduce: 0.00 | optimizer: 42.05 | batch generator: 2.79 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step135500 + samples/sec: 652.571 | iteration 139000/ 143000 | elapsed time per iteration (ms): 1569.2 | learning rate: 6.115E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.160093E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.51 | backward: 1182.00 | backward-backward: 1181.96 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 2.76 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step136000 + samples/sec: 652.503 | iteration 139500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 6.089E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.158746E+00 | loss scale: 32768.0 | number of skipped iterations: 2 | number of nan iterations: 0 | +time (ms) | forward: 341.42 | backward: 1181.80 | backward-backward: 1181.76 | backward-allreduce: 0.00 | optimizer: 41.97 | batch generator: 2.41 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step136500 + samples/sec: 652.650 | iteration 140000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 6.066E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.157870E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.90 | backward: 1181.51 | backward-backward: 1181.47 | backward-allreduce: 0.00 | optimizer: 42.07 | batch generator: 2.98 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step137000 +----------------------------------------------------------------------------------------------------------- + validation results at iteration 140000 | lm_loss value: 2.113937E+00 | lm_loss_ppl value: 8.280780E+00 | +----------------------------------------------------------------------------------------------------------- + samples/sec: 623.943 | iteration 140500/ 143000 | elapsed time per iteration (ms): 1641.2 | learning rate: 6.047E-05 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.158919E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.70 | backward: 1181.65 | backward-backward: 1181.62 | backward-allreduce: 0.00 | optimizer: 42.12 | batch generator: 3.12 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step137500 + samples/sec: 652.627 | iteration 141000/ 143000 | elapsed time per iteration (ms): 1569.0 | learning rate: 6.031E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.159078E+00 | loss scale: 65536.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.48 | backward: 1181.81 | backward-backward: 1181.77 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 2.65 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step138000 + samples/sec: 652.507 | iteration 141500/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 6.018E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.158313E+00 | loss scale: 32768.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.83 | backward: 1181.47 | backward-backward: 1181.43 | backward-allreduce: 0.00 | optimizer: 42.01 | batch generator: 2.88 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step138500 + samples/sec: 652.482 | iteration 142000/ 143000 | elapsed time per iteration (ms): 1569.4 | learning rate: 6.009E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.159870E+00 | loss scale: 16384.0 | number of skipped iterations: 1 | number of nan iterations: 0 | +time (ms) | forward: 341.70 | backward: 1181.87 | backward-backward: 1181.83 | backward-allreduce: 0.00 | optimizer: 42.02 | batch generator: 2.86 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step139000 + samples/sec: 650.872 | iteration 142500/ 143000 | elapsed time per iteration (ms): 1573.3 | learning rate: 6.003E-05 | approx flops per GPU: 73.5TFLOPS | lm_loss: 2.157927E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.64 | backward: 1185.68 | backward-backward: 1185.64 | backward-allreduce: 0.00 | optimizer: 42.21 | batch generator: 2.79 +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step139500 + samples/sec: 652.507 | iteration 143000/ 143000 | elapsed time per iteration (ms): 1569.3 | learning rate: 6.000E-05 | approx flops per GPU: 73.7TFLOPS | lm_loss: 2.158180E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) | forward: 341.84 | backward: 1181.68 | backward-backward: 1181.64 | backward-allreduce: 0.00 | optimizer: 42.19 | batch generator: 2.93 +--------------------------------------------------------------------------------------------------------------------------- + validation results at the end of training for val data | lm_loss value: 2.115951E+00 | lm_loss_ppl value: 8.297475E+00 | +--------------------------------------------------------------------------------------------------------------------------- +WARNING: Deleting old checkpoints: + /u/wangh/workspace_ptmp/checkpoints_gptneox/flownet/450M_nogptj_nobias_nomalinit_wtying_nonparamln_finalwithparam_lr6e-4/checkpoints/global_step140000 +---------------------------------------------------------------------------------------------------------------------- + test results at the end of training for test data | lm_loss value: 2.107435E+00 | lm_loss_ppl value: 8.227115E+00 | +---------------------------------------------------------------------------------------------------------------------- + +