jpata's picture
added CMS model from 2024 April 5 PF meeting
652e089
[2024-03-24 23:57:43,238] INFO: Will use single-gpu: NVIDIA A100 80GB PCIe
[2024-03-24 23:57:43,242] INFO: using dtype=torch.bfloat16
[2024-03-24 23:57:43,242] INFO: using dtype=torch.bfloat16
[2024-03-24 23:57:43,277] INFO: using attention_type=flash
[2024-03-24 23:57:43,277] INFO: using attention_type=flash
[2024-03-24 23:57:43,290] INFO: using attention_type=flash
[2024-03-24 23:57:43,290] INFO: using attention_type=flash
[2024-03-24 23:57:43,302] INFO: using attention_type=flash
[2024-03-24 23:57:43,302] INFO: using attention_type=flash
[2024-03-24 23:57:43,314] INFO: using attention_type=flash
[2024-03-24 23:57:43,314] INFO: using attention_type=flash
[2024-03-24 23:57:43,330] INFO: using attention_type=flash
[2024-03-24 23:57:43,330] INFO: using attention_type=flash
[2024-03-24 23:57:43,342] INFO: using attention_type=flash
[2024-03-24 23:57:43,342] INFO: using attention_type=flash
[2024-03-24 23:57:43,354] INFO: using attention_type=flash
[2024-03-24 23:57:43,354] INFO: using attention_type=flash
[2024-03-24 23:57:43,367] INFO: using attention_type=flash
[2024-03-24 23:57:43,367] INFO: using attention_type=flash
[2024-03-24 23:57:43,380] INFO: using attention_type=flash
[2024-03-24 23:57:43,380] INFO: using attention_type=flash
[2024-03-24 23:57:43,392] INFO: using attention_type=flash
[2024-03-24 23:57:43,392] INFO: using attention_type=flash
[2024-03-24 23:57:43,404] INFO: using attention_type=flash
[2024-03-24 23:57:43,404] INFO: using attention_type=flash
[2024-03-24 23:57:43,415] INFO: using attention_type=flash
[2024-03-24 23:57:43,415] INFO: using attention_type=flash
[2024-03-24 23:57:43,660] INFO: MLPF(
(nn0_id): Sequential(
(0): Linear(in_features=55, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=512, bias=True)
)
(nn0_reg): Sequential(
(0): Linear(in_features=55, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=512, bias=True)
)
(conv_id): ModuleList(
(0-5): 6 x SelfAttentionLayer(
(mha): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(norm0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(seq): Sequential(
(0): Linear(in_features=512, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(conv_reg): ModuleList(
(0-5): 6 x SelfAttentionLayer(
(mha): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(norm0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(seq): Sequential(
(0): Linear(in_features=512, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(nn_id): Sequential(
(0): Linear(in_features=567, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=9, bias=True)
)
(nn_pt): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_eta): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_sin_phi): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_cos_phi): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_energy): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
)
[2024-03-24 23:57:43,660] INFO: MLPF(
(nn0_id): Sequential(
(0): Linear(in_features=55, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=512, bias=True)
)
(nn0_reg): Sequential(
(0): Linear(in_features=55, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=512, bias=True)
)
(conv_id): ModuleList(
(0-5): 6 x SelfAttentionLayer(
(mha): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(norm0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(seq): Sequential(
(0): Linear(in_features=512, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(conv_reg): ModuleList(
(0-5): 6 x SelfAttentionLayer(
(mha): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(norm0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(seq): Sequential(
(0): Linear(in_features=512, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(nn_id): Sequential(
(0): Linear(in_features=567, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=9, bias=True)
)
(nn_pt): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_eta): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_sin_phi): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_cos_phi): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
(nn_energy): RegressionOutput(
(nn): Sequential(
(0): Linear(in_features=576, out_features=512, bias=True)
(1): ReLU()
(2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(3): Dropout(p=0.0, inplace=False)
(4): Linear(in_features=512, out_features=2, bias=True)
)
)
)
[2024-03-24 23:57:43,666] INFO: Trainable parameters: 21304339
[2024-03-24 23:57:43,666] INFO: Trainable parameters: 21304339
[2024-03-24 23:57:43,668] INFO: Non-trainable parameters: 0
[2024-03-24 23:57:43,668] INFO: Non-trainable parameters: 0
[2024-03-24 23:57:43,671] INFO: Total parameters: 21304339
[2024-03-24 23:57:43,671] INFO: Total parameters: 21304339
[2024-03-24 23:57:43,682] INFO: Modules Trainable parameters Non-tranable parameters
nn0_id.0.weight 28160 0
nn0_id.0.bias 512 0
nn0_id.2.weight 512 0
nn0_id.2.bias 512 0
nn0_id.4.weight 262144 0
nn0_id.4.bias 512 0
nn0_reg.0.weight 28160 0
nn0_reg.0.bias 512 0
nn0_reg.2.weight 512 0
nn0_reg.2.bias 512 0
nn0_reg.4.weight 262144 0
nn0_reg.4.bias 512 0
conv_id.0.mha.in_proj_weight 786432 0
conv_id.0.mha.in_proj_bias 1536 0
conv_id.0.mha.out_proj.weight 262144 0
conv_id.0.mha.out_proj.bias 512 0
conv_id.0.norm0.weight 512 0
conv_id.0.norm0.bias 512 0
conv_id.0.norm1.weight 512 0
conv_id.0.norm1.bias 512 0
conv_id.0.seq.0.weight 262144 0
conv_id.0.seq.0.bias 512 0
conv_id.0.seq.2.weight 262144 0
conv_id.0.seq.2.bias 512 0
conv_id.1.mha.in_proj_weight 786432 0
conv_id.1.mha.in_proj_bias 1536 0
conv_id.1.mha.out_proj.weight 262144 0
conv_id.1.mha.out_proj.bias 512 0
conv_id.1.norm0.weight 512 0
conv_id.1.norm0.bias 512 0
conv_id.1.norm1.weight 512 0
conv_id.1.norm1.bias 512 0
conv_id.1.seq.0.weight 262144 0
conv_id.1.seq.0.bias 512 0
conv_id.1.seq.2.weight 262144 0
conv_id.1.seq.2.bias 512 0
conv_id.2.mha.in_proj_weight 786432 0
conv_id.2.mha.in_proj_bias 1536 0
conv_id.2.mha.out_proj.weight 262144 0
conv_id.2.mha.out_proj.bias 512 0
conv_id.2.norm0.weight 512 0
conv_id.2.norm0.bias 512 0
conv_id.2.norm1.weight 512 0
conv_id.2.norm1.bias 512 0
conv_id.2.seq.0.weight 262144 0
conv_id.2.seq.0.bias 512 0
conv_id.2.seq.2.weight 262144 0
conv_id.2.seq.2.bias 512 0
conv_id.3.mha.in_proj_weight 786432 0
conv_id.3.mha.in_proj_bias 1536 0
conv_id.3.mha.out_proj.weight 262144 0
conv_id.3.mha.out_proj.bias 512 0
conv_id.3.norm0.weight 512 0
conv_id.3.norm0.bias 512 0
conv_id.3.norm1.weight 512 0
conv_id.3.norm1.bias 512 0
conv_id.3.seq.0.weight 262144 0
conv_id.3.seq.0.bias 512 0
conv_id.3.seq.2.weight 262144 0
conv_id.3.seq.2.bias 512 0
conv_id.4.mha.in_proj_weight 786432 0
conv_id.4.mha.in_proj_bias 1536 0
conv_id.4.mha.out_proj.weight 262144 0
conv_id.4.mha.out_proj.bias 512 0
conv_id.4.norm0.weight 512 0
conv_id.4.norm0.bias 512 0
conv_id.4.norm1.weight 512 0
conv_id.4.norm1.bias 512 0
conv_id.4.seq.0.weight 262144 0
conv_id.4.seq.0.bias 512 0
conv_id.4.seq.2.weight 262144 0
conv_id.4.seq.2.bias 512 0
conv_id.5.mha.in_proj_weight 786432 0
conv_id.5.mha.in_proj_bias 1536 0
conv_id.5.mha.out_proj.weight 262144 0
conv_id.5.mha.out_proj.bias 512 0
conv_id.5.norm0.weight 512 0
conv_id.5.norm0.bias 512 0
conv_id.5.norm1.weight 512 0
conv_id.5.norm1.bias 512 0
conv_id.5.seq.0.weight 262144 0
conv_id.5.seq.0.bias 512 0
conv_id.5.seq.2.weight 262144 0
conv_id.5.seq.2.bias 512 0
conv_reg.0.mha.in_proj_weight 786432 0
conv_reg.0.mha.in_proj_bias 1536 0
conv_reg.0.mha.out_proj.weight 262144 0
conv_reg.0.mha.out_proj.bias 512 0
conv_reg.0.norm0.weight 512 0
conv_reg.0.norm0.bias 512 0
conv_reg.0.norm1.weight 512 0
conv_reg.0.norm1.bias 512 0
conv_reg.0.seq.0.weight 262144 0
conv_reg.0.seq.0.bias 512 0
conv_reg.0.seq.2.weight 262144 0
conv_reg.0.seq.2.bias 512 0
conv_reg.1.mha.in_proj_weight 786432 0
conv_reg.1.mha.in_proj_bias 1536 0
conv_reg.1.mha.out_proj.weight 262144 0
conv_reg.1.mha.out_proj.bias 512 0
conv_reg.1.norm0.weight 512 0
conv_reg.1.norm0.bias 512 0
conv_reg.1.norm1.weight 512 0
conv_reg.1.norm1.bias 512 0
conv_reg.1.seq.0.weight 262144 0
conv_reg.1.seq.0.bias 512 0
conv_reg.1.seq.2.weight 262144 0
conv_reg.1.seq.2.bias 512 0
conv_reg.2.mha.in_proj_weight 786432 0
conv_reg.2.mha.in_proj_bias 1536 0
conv_reg.2.mha.out_proj.weight 262144 0
conv_reg.2.mha.out_proj.bias 512 0
conv_reg.2.norm0.weight 512 0
conv_reg.2.norm0.bias 512 0
conv_reg.2.norm1.weight 512 0
conv_reg.2.norm1.bias 512 0
conv_reg.2.seq.0.weight 262144 0
conv_reg.2.seq.0.bias 512 0
conv_reg.2.seq.2.weight 262144 0
conv_reg.2.seq.2.bias 512 0
conv_reg.3.mha.in_proj_weight 786432 0
conv_reg.3.mha.in_proj_bias 1536 0
conv_reg.3.mha.out_proj.weight 262144 0
conv_reg.3.mha.out_proj.bias 512 0
conv_reg.3.norm0.weight 512 0
conv_reg.3.norm0.bias 512 0
conv_reg.3.norm1.weight 512 0
conv_reg.3.norm1.bias 512 0
conv_reg.3.seq.0.weight 262144 0
conv_reg.3.seq.0.bias 512 0
conv_reg.3.seq.2.weight 262144 0
conv_reg.3.seq.2.bias 512 0
conv_reg.4.mha.in_proj_weight 786432 0
conv_reg.4.mha.in_proj_bias 1536 0
conv_reg.4.mha.out_proj.weight 262144 0
conv_reg.4.mha.out_proj.bias 512 0
conv_reg.4.norm0.weight 512 0
conv_reg.4.norm0.bias 512 0
conv_reg.4.norm1.weight 512 0
conv_reg.4.norm1.bias 512 0
conv_reg.4.seq.0.weight 262144 0
conv_reg.4.seq.0.bias 512 0
conv_reg.4.seq.2.weight 262144 0
conv_reg.4.seq.2.bias 512 0
conv_reg.5.mha.in_proj_weight 786432 0
conv_reg.5.mha.in_proj_bias 1536 0
conv_reg.5.mha.out_proj.weight 262144 0
conv_reg.5.mha.out_proj.bias 512 0
conv_reg.5.norm0.weight 512 0
conv_reg.5.norm0.bias 512 0
conv_reg.5.norm1.weight 512 0
conv_reg.5.norm1.bias 512 0
conv_reg.5.seq.0.weight 262144 0
conv_reg.5.seq.0.bias 512 0
conv_reg.5.seq.2.weight 262144 0
conv_reg.5.seq.2.bias 512 0
nn_id.0.weight 290304 0
nn_id.0.bias 512 0
nn_id.2.weight 512 0
nn_id.2.bias 512 0
nn_id.4.weight 4608 0
nn_id.4.bias 9 0
nn_pt.nn.0.weight 294912 0
nn_pt.nn.0.bias 512 0
nn_pt.nn.2.weight 512 0
nn_pt.nn.2.bias 512 0
nn_pt.nn.4.weight 1024 0
nn_pt.nn.4.bias 2 0
nn_eta.nn.0.weight 294912 0
nn_eta.nn.0.bias 512 0
nn_eta.nn.2.weight 512 0
nn_eta.nn.2.bias 512 0
nn_eta.nn.4.weight 1024 0
nn_eta.nn.4.bias 2 0
nn_sin_phi.nn.0.weight 294912 0
nn_sin_phi.nn.0.bias 512 0
nn_sin_phi.nn.2.weight 512 0
nn_sin_phi.nn.2.bias 512 0
nn_sin_phi.nn.4.weight 1024 0
nn_sin_phi.nn.4.bias 2 0
nn_cos_phi.nn.0.weight 294912 0
nn_cos_phi.nn.0.bias 512 0
nn_cos_phi.nn.2.weight 512 0
nn_cos_phi.nn.2.bias 512 0
nn_cos_phi.nn.4.weight 1024 0
nn_cos_phi.nn.4.bias 2 0
nn_energy.nn.0.weight 294912 0
nn_energy.nn.0.bias 512 0
nn_energy.nn.2.weight 512 0
nn_energy.nn.2.bias 512 0
nn_energy.nn.4.weight 1024 0
nn_energy.nn.4.bias 2 0
[2024-03-24 23:57:43,682] INFO: Modules Trainable parameters Non-tranable parameters
nn0_id.0.weight 28160 0
nn0_id.0.bias 512 0
nn0_id.2.weight 512 0
nn0_id.2.bias 512 0
nn0_id.4.weight 262144 0
nn0_id.4.bias 512 0
nn0_reg.0.weight 28160 0
nn0_reg.0.bias 512 0
nn0_reg.2.weight 512 0
nn0_reg.2.bias 512 0
nn0_reg.4.weight 262144 0
nn0_reg.4.bias 512 0
conv_id.0.mha.in_proj_weight 786432 0
conv_id.0.mha.in_proj_bias 1536 0
conv_id.0.mha.out_proj.weight 262144 0
conv_id.0.mha.out_proj.bias 512 0
conv_id.0.norm0.weight 512 0
conv_id.0.norm0.bias 512 0
conv_id.0.norm1.weight 512 0
conv_id.0.norm1.bias 512 0
conv_id.0.seq.0.weight 262144 0
conv_id.0.seq.0.bias 512 0
conv_id.0.seq.2.weight 262144 0
conv_id.0.seq.2.bias 512 0
conv_id.1.mha.in_proj_weight 786432 0
conv_id.1.mha.in_proj_bias 1536 0
conv_id.1.mha.out_proj.weight 262144 0
conv_id.1.mha.out_proj.bias 512 0
conv_id.1.norm0.weight 512 0
conv_id.1.norm0.bias 512 0
conv_id.1.norm1.weight 512 0
conv_id.1.norm1.bias 512 0
conv_id.1.seq.0.weight 262144 0
conv_id.1.seq.0.bias 512 0
conv_id.1.seq.2.weight 262144 0
conv_id.1.seq.2.bias 512 0
conv_id.2.mha.in_proj_weight 786432 0
conv_id.2.mha.in_proj_bias 1536 0
conv_id.2.mha.out_proj.weight 262144 0
conv_id.2.mha.out_proj.bias 512 0
conv_id.2.norm0.weight 512 0
conv_id.2.norm0.bias 512 0
conv_id.2.norm1.weight 512 0
conv_id.2.norm1.bias 512 0
conv_id.2.seq.0.weight 262144 0
conv_id.2.seq.0.bias 512 0
conv_id.2.seq.2.weight 262144 0
conv_id.2.seq.2.bias 512 0
conv_id.3.mha.in_proj_weight 786432 0
conv_id.3.mha.in_proj_bias 1536 0
conv_id.3.mha.out_proj.weight 262144 0
conv_id.3.mha.out_proj.bias 512 0
conv_id.3.norm0.weight 512 0
conv_id.3.norm0.bias 512 0
conv_id.3.norm1.weight 512 0
conv_id.3.norm1.bias 512 0
conv_id.3.seq.0.weight 262144 0
conv_id.3.seq.0.bias 512 0
conv_id.3.seq.2.weight 262144 0
conv_id.3.seq.2.bias 512 0
conv_id.4.mha.in_proj_weight 786432 0
conv_id.4.mha.in_proj_bias 1536 0
conv_id.4.mha.out_proj.weight 262144 0
conv_id.4.mha.out_proj.bias 512 0
conv_id.4.norm0.weight 512 0
conv_id.4.norm0.bias 512 0
conv_id.4.norm1.weight 512 0
conv_id.4.norm1.bias 512 0
conv_id.4.seq.0.weight 262144 0
conv_id.4.seq.0.bias 512 0
conv_id.4.seq.2.weight 262144 0
conv_id.4.seq.2.bias 512 0
conv_id.5.mha.in_proj_weight 786432 0
conv_id.5.mha.in_proj_bias 1536 0
conv_id.5.mha.out_proj.weight 262144 0
conv_id.5.mha.out_proj.bias 512 0
conv_id.5.norm0.weight 512 0
conv_id.5.norm0.bias 512 0
conv_id.5.norm1.weight 512 0
conv_id.5.norm1.bias 512 0
conv_id.5.seq.0.weight 262144 0
conv_id.5.seq.0.bias 512 0
conv_id.5.seq.2.weight 262144 0
conv_id.5.seq.2.bias 512 0
conv_reg.0.mha.in_proj_weight 786432 0
conv_reg.0.mha.in_proj_bias 1536 0
conv_reg.0.mha.out_proj.weight 262144 0
conv_reg.0.mha.out_proj.bias 512 0
conv_reg.0.norm0.weight 512 0
conv_reg.0.norm0.bias 512 0
conv_reg.0.norm1.weight 512 0
conv_reg.0.norm1.bias 512 0
conv_reg.0.seq.0.weight 262144 0
conv_reg.0.seq.0.bias 512 0
conv_reg.0.seq.2.weight 262144 0
conv_reg.0.seq.2.bias 512 0
conv_reg.1.mha.in_proj_weight 786432 0
conv_reg.1.mha.in_proj_bias 1536 0
conv_reg.1.mha.out_proj.weight 262144 0
conv_reg.1.mha.out_proj.bias 512 0
conv_reg.1.norm0.weight 512 0
conv_reg.1.norm0.bias 512 0
conv_reg.1.norm1.weight 512 0
conv_reg.1.norm1.bias 512 0
conv_reg.1.seq.0.weight 262144 0
conv_reg.1.seq.0.bias 512 0
conv_reg.1.seq.2.weight 262144 0
conv_reg.1.seq.2.bias 512 0
conv_reg.2.mha.in_proj_weight 786432 0
conv_reg.2.mha.in_proj_bias 1536 0
conv_reg.2.mha.out_proj.weight 262144 0
conv_reg.2.mha.out_proj.bias 512 0
conv_reg.2.norm0.weight 512 0
conv_reg.2.norm0.bias 512 0
conv_reg.2.norm1.weight 512 0
conv_reg.2.norm1.bias 512 0
conv_reg.2.seq.0.weight 262144 0
conv_reg.2.seq.0.bias 512 0
conv_reg.2.seq.2.weight 262144 0
conv_reg.2.seq.2.bias 512 0
conv_reg.3.mha.in_proj_weight 786432 0
conv_reg.3.mha.in_proj_bias 1536 0
conv_reg.3.mha.out_proj.weight 262144 0
conv_reg.3.mha.out_proj.bias 512 0
conv_reg.3.norm0.weight 512 0
conv_reg.3.norm0.bias 512 0
conv_reg.3.norm1.weight 512 0
conv_reg.3.norm1.bias 512 0
conv_reg.3.seq.0.weight 262144 0
conv_reg.3.seq.0.bias 512 0
conv_reg.3.seq.2.weight 262144 0
conv_reg.3.seq.2.bias 512 0
conv_reg.4.mha.in_proj_weight 786432 0
conv_reg.4.mha.in_proj_bias 1536 0
conv_reg.4.mha.out_proj.weight 262144 0
conv_reg.4.mha.out_proj.bias 512 0
conv_reg.4.norm0.weight 512 0
conv_reg.4.norm0.bias 512 0
conv_reg.4.norm1.weight 512 0
conv_reg.4.norm1.bias 512 0
conv_reg.4.seq.0.weight 262144 0
conv_reg.4.seq.0.bias 512 0
conv_reg.4.seq.2.weight 262144 0
conv_reg.4.seq.2.bias 512 0
conv_reg.5.mha.in_proj_weight 786432 0
conv_reg.5.mha.in_proj_bias 1536 0
conv_reg.5.mha.out_proj.weight 262144 0
conv_reg.5.mha.out_proj.bias 512 0
conv_reg.5.norm0.weight 512 0
conv_reg.5.norm0.bias 512 0
conv_reg.5.norm1.weight 512 0
conv_reg.5.norm1.bias 512 0
conv_reg.5.seq.0.weight 262144 0
conv_reg.5.seq.0.bias 512 0
conv_reg.5.seq.2.weight 262144 0
conv_reg.5.seq.2.bias 512 0
nn_id.0.weight 290304 0
nn_id.0.bias 512 0
nn_id.2.weight 512 0
nn_id.2.bias 512 0
nn_id.4.weight 4608 0
nn_id.4.bias 9 0
nn_pt.nn.0.weight 294912 0
nn_pt.nn.0.bias 512 0
nn_pt.nn.2.weight 512 0
nn_pt.nn.2.bias 512 0
nn_pt.nn.4.weight 1024 0
nn_pt.nn.4.bias 2 0
nn_eta.nn.0.weight 294912 0
nn_eta.nn.0.bias 512 0
nn_eta.nn.2.weight 512 0
nn_eta.nn.2.bias 512 0
nn_eta.nn.4.weight 1024 0
nn_eta.nn.4.bias 2 0
nn_sin_phi.nn.0.weight 294912 0
nn_sin_phi.nn.0.bias 512 0
nn_sin_phi.nn.2.weight 512 0
nn_sin_phi.nn.2.bias 512 0
nn_sin_phi.nn.4.weight 1024 0
nn_sin_phi.nn.4.bias 2 0
nn_cos_phi.nn.0.weight 294912 0
nn_cos_phi.nn.0.bias 512 0
nn_cos_phi.nn.2.weight 512 0
nn_cos_phi.nn.2.bias 512 0
nn_cos_phi.nn.4.weight 1024 0
nn_cos_phi.nn.4.bias 2 0
nn_energy.nn.0.weight 294912 0
nn_energy.nn.0.bias 512 0
nn_energy.nn.2.weight 512 0
nn_energy.nn.2.bias 512 0
nn_energy.nn.4.weight 1024 0
nn_energy.nn.4.bias 2 0
[2024-03-24 23:57:43,685] INFO: Creating experiment dir experiments/pyg-cms_20240324_235743_208080
[2024-03-24 23:57:43,685] INFO: Creating experiment dir experiments/pyg-cms_20240324_235743_208080
[2024-03-24 23:57:43,690] INFO: Model directory experiments/pyg-cms_20240324_235743_208080
[2024-03-24 23:57:43,690] INFO: Model directory experiments/pyg-cms_20240324_235743_208080
[2024-03-24 23:57:49,345] INFO: train_dataset: cms_pf_ttbar, 320100
[2024-03-24 23:57:49,345] INFO: train_dataset: cms_pf_ttbar, 320100
[2024-03-24 23:57:49,622] INFO: valid_dataset: cms_pf_ttbar, 80040
[2024-03-24 23:57:49,622] INFO: valid_dataset: cms_pf_ttbar, 80040
[2024-03-24 23:57:49,736] INFO: Initiating epoch #1 train run on device rank=0
[2024-03-24 23:57:49,736] INFO: Initiating epoch #1 train run on device rank=0
[2024-03-25 04:22:50,148] INFO: Initiating epoch #1 valid run on device rank=0
[2024-03-25 04:22:50,148] INFO: Initiating epoch #1 valid run on device rank=0
[2024-03-25 04:42:43,853] INFO: Rank 0: epoch=1 / 100 train_loss=20.8840 valid_loss=19.4969 stale=0 time=284.9m eta=28205.3m
[2024-03-25 04:42:43,853] INFO: Rank 0: epoch=1 / 100 train_loss=20.8840 valid_loss=19.4969 stale=0 time=284.9m eta=28205.3m
[2024-03-25 04:42:43,865] INFO: Initiating epoch #2 train run on device rank=0
[2024-03-25 04:42:43,865] INFO: Initiating epoch #2 train run on device rank=0
[2024-03-25 09:08:30,154] INFO: Initiating epoch #2 valid run on device rank=0
[2024-03-25 09:08:30,154] INFO: Initiating epoch #2 valid run on device rank=0
[2024-03-25 09:28:30,625] INFO: Rank 0: epoch=2 / 100 train_loss=19.0650 valid_loss=18.8620 stale=0 time=285.78m eta=27963.4m
[2024-03-25 09:28:30,625] INFO: Rank 0: epoch=2 / 100 train_loss=19.0650 valid_loss=18.8620 stale=0 time=285.78m eta=27963.4m
[2024-03-25 09:28:30,647] INFO: Initiating epoch #3 train run on device rank=0
[2024-03-25 09:28:30,647] INFO: Initiating epoch #3 train run on device rank=0
[2024-03-25 13:55:12,130] INFO: Initiating epoch #3 valid run on device rank=0
[2024-03-25 13:55:12,130] INFO: Initiating epoch #3 valid run on device rank=0
[2024-03-25 14:15:15,806] INFO: Rank 0: epoch=3 / 100 train_loss=18.7688 valid_loss=18.6758 stale=0 time=286.75m eta=27723.7m
[2024-03-25 14:15:15,806] INFO: Rank 0: epoch=3 / 100 train_loss=18.7688 valid_loss=18.6758 stale=0 time=286.75m eta=27723.7m
[2024-03-25 14:15:15,821] INFO: Initiating epoch #4 train run on device rank=0
[2024-03-25 14:15:15,821] INFO: Initiating epoch #4 train run on device rank=0
[2024-03-25 18:42:35,229] INFO: Initiating epoch #4 valid run on device rank=0
[2024-03-25 18:42:35,229] INFO: Initiating epoch #4 valid run on device rank=0
[2024-03-25 19:02:40,697] INFO: Rank 0: epoch=4 / 100 train_loss=18.6170 valid_loss=18.5653 stale=0 time=287.41m eta=27476.4m
[2024-03-25 19:02:40,697] INFO: Rank 0: epoch=4 / 100 train_loss=18.6170 valid_loss=18.5653 stale=0 time=287.41m eta=27476.4m
[2024-03-25 19:02:40,717] INFO: Initiating epoch #5 train run on device rank=0
[2024-03-25 19:02:40,717] INFO: Initiating epoch #5 train run on device rank=0
[2024-03-25 23:29:55,640] INFO: Initiating epoch #5 valid run on device rank=0
[2024-03-25 23:29:55,640] INFO: Initiating epoch #5 valid run on device rank=0
[2024-03-25 23:50:00,453] INFO: Rank 0: epoch=5 / 100 train_loss=18.5102 valid_loss=18.4685 stale=0 time=287.33m eta=27211.4m
[2024-03-25 23:50:00,453] INFO: Rank 0: epoch=5 / 100 train_loss=18.5102 valid_loss=18.4685 stale=0 time=287.33m eta=27211.4m
[2024-03-25 23:50:00,467] INFO: Initiating epoch #6 train run on device rank=0
[2024-03-25 23:50:00,467] INFO: Initiating epoch #6 train run on device rank=0
[2024-03-26 04:16:46,611] INFO: Initiating epoch #6 valid run on device rank=0
[2024-03-26 04:16:46,611] INFO: Initiating epoch #6 valid run on device rank=0
[2024-03-26 04:36:44,551] INFO: Rank 0: epoch=6 / 100 train_loss=18.4325 valid_loss=18.4090 stale=0 time=286.73m eta=26929.6m
[2024-03-26 04:36:44,551] INFO: Rank 0: epoch=6 / 100 train_loss=18.4325 valid_loss=18.4090 stale=0 time=286.73m eta=26929.6m
[2024-03-26 04:36:44,567] INFO: Initiating epoch #7 train run on device rank=0
[2024-03-26 04:36:44,567] INFO: Initiating epoch #7 train run on device rank=0
[2024-03-26 09:01:18,997] INFO: Initiating epoch #7 valid run on device rank=0
[2024-03-26 09:01:18,997] INFO: Initiating epoch #7 valid run on device rank=0
[2024-03-26 09:21:17,339] INFO: Rank 0: epoch=7 / 100 train_loss=18.3752 valid_loss=18.3620 stale=0 time=284.55m eta=26617.4m
[2024-03-26 09:21:17,339] INFO: Rank 0: epoch=7 / 100 train_loss=18.3752 valid_loss=18.3620 stale=0 time=284.55m eta=26617.4m
[2024-03-26 09:21:17,356] INFO: Initiating epoch #8 train run on device rank=0
[2024-03-26 09:21:17,356] INFO: Initiating epoch #8 train run on device rank=0
[2024-03-26 13:46:38,478] INFO: Initiating epoch #8 valid run on device rank=0
[2024-03-26 13:46:38,478] INFO: Initiating epoch #8 valid run on device rank=0
[2024-03-26 14:06:32,269] INFO: Rank 0: epoch=8 / 100 train_loss=18.3286 valid_loss=18.3267 stale=0 time=285.25m eta=26320.2m
[2024-03-26 14:06:32,269] INFO: Rank 0: epoch=8 / 100 train_loss=18.3286 valid_loss=18.3267 stale=0 time=285.25m eta=26320.2m
[2024-03-26 14:06:32,282] INFO: Initiating epoch #9 train run on device rank=0
[2024-03-26 14:06:32,282] INFO: Initiating epoch #9 train run on device rank=0
[2024-03-26 18:32:18,831] INFO: Initiating epoch #9 valid run on device rank=0
[2024-03-26 18:32:18,831] INFO: Initiating epoch #9 valid run on device rank=0
[2024-03-26 18:52:20,399] INFO: Rank 0: epoch=9 / 100 train_loss=18.2890 valid_loss=18.2883 stale=0 time=285.8m eta=26031.2m
[2024-03-26 18:52:20,399] INFO: Rank 0: epoch=9 / 100 train_loss=18.2890 valid_loss=18.2883 stale=0 time=285.8m eta=26031.2m
[2024-03-26 18:52:20,416] INFO: Initiating epoch #10 train run on device rank=0
[2024-03-26 18:52:20,416] INFO: Initiating epoch #10 train run on device rank=0
[2024-03-26 23:18:19,010] INFO: Initiating epoch #10 valid run on device rank=0
[2024-03-26 23:18:19,010] INFO: Initiating epoch #10 valid run on device rank=0
[2024-03-26 23:38:18,380] INFO: Rank 0: epoch=10 / 100 train_loss=18.2532 valid_loss=18.2458 stale=0 time=285.97m eta=25744.3m
[2024-03-26 23:38:18,380] INFO: Rank 0: epoch=10 / 100 train_loss=18.2532 valid_loss=18.2458 stale=0 time=285.97m eta=25744.3m
[2024-03-26 23:38:18,395] INFO: Initiating epoch #11 train run on device rank=0
[2024-03-26 23:38:18,395] INFO: Initiating epoch #11 train run on device rank=0
[2024-03-27 04:04:16,483] INFO: Initiating epoch #11 valid run on device rank=0
[2024-03-27 04:04:16,483] INFO: Initiating epoch #11 valid run on device rank=0
[2024-03-27 04:24:14,645] INFO: Rank 0: epoch=11 / 100 train_loss=18.2191 valid_loss=18.2212 stale=0 time=285.94m eta=25457.4m
[2024-03-27 04:24:14,645] INFO: Rank 0: epoch=11 / 100 train_loss=18.2191 valid_loss=18.2212 stale=0 time=285.94m eta=25457.4m
[2024-03-27 04:24:14,659] INFO: Initiating epoch #12 train run on device rank=0
[2024-03-27 04:24:14,659] INFO: Initiating epoch #12 train run on device rank=0
[2024-03-27 08:50:12,730] INFO: Initiating epoch #12 valid run on device rank=0
[2024-03-27 08:50:12,730] INFO: Initiating epoch #12 valid run on device rank=0
[2024-03-27 09:10:07,240] INFO: Rank 0: epoch=12 / 100 train_loss=18.1875 valid_loss=18.2005 stale=0 time=285.88m eta=25170.1m
[2024-03-27 09:10:07,240] INFO: Rank 0: epoch=12 / 100 train_loss=18.1875 valid_loss=18.2005 stale=0 time=285.88m eta=25170.1m
[2024-03-27 09:10:07,255] INFO: Initiating epoch #13 train run on device rank=0
[2024-03-27 09:10:07,255] INFO: Initiating epoch #13 train run on device rank=0
[2024-03-27 13:36:41,534] INFO: Initiating epoch #13 valid run on device rank=0
[2024-03-27 13:36:41,534] INFO: Initiating epoch #13 valid run on device rank=0
[2024-03-27 13:56:51,429] INFO: Rank 0: epoch=13 / 100 train_loss=18.1578 valid_loss=18.1808 stale=0 time=286.74m eta=24888.9m
[2024-03-27 13:56:51,429] INFO: Rank 0: epoch=13 / 100 train_loss=18.1578 valid_loss=18.1808 stale=0 time=286.74m eta=24888.9m
[2024-03-27 13:56:51,446] INFO: Initiating epoch #14 train run on device rank=0
[2024-03-27 13:56:51,446] INFO: Initiating epoch #14 train run on device rank=0
[2024-03-27 18:24:24,174] INFO: Initiating epoch #14 valid run on device rank=0
[2024-03-27 18:24:24,174] INFO: Initiating epoch #14 valid run on device rank=0
[2024-03-27 18:44:20,853] INFO: Rank 0: epoch=14 / 100 train_loss=18.1282 valid_loss=18.1575 stale=0 time=287.49m eta=24611.5m
[2024-03-27 18:44:20,853] INFO: Rank 0: epoch=14 / 100 train_loss=18.1282 valid_loss=18.1575 stale=0 time=287.49m eta=24611.5m
[2024-03-27 18:44:20,870] INFO: Initiating epoch #15 train run on device rank=0
[2024-03-27 18:44:20,870] INFO: Initiating epoch #15 train run on device rank=0
[2024-03-27 23:12:11,710] INFO: Initiating epoch #15 valid run on device rank=0
[2024-03-27 23:12:11,710] INFO: Initiating epoch #15 valid run on device rank=0
[2024-03-27 23:32:20,988] INFO: Rank 0: epoch=15 / 100 train_loss=18.0996 valid_loss=18.1267 stale=0 time=288.0m eta=24335.6m
[2024-03-27 23:32:20,988] INFO: Rank 0: epoch=15 / 100 train_loss=18.0996 valid_loss=18.1267 stale=0 time=288.0m eta=24335.6m
[2024-03-27 23:32:21,002] INFO: Initiating epoch #16 train run on device rank=0
[2024-03-27 23:32:21,002] INFO: Initiating epoch #16 train run on device rank=0
[2024-03-28 03:58:27,660] INFO: Initiating epoch #16 valid run on device rank=0
[2024-03-28 03:58:27,660] INFO: Initiating epoch #16 valid run on device rank=0
[2024-03-28 04:18:21,127] INFO: Rank 0: epoch=16 / 100 train_loss=18.0729 valid_loss=18.1030 stale=0 time=286.0m eta=24047.7m
[2024-03-28 04:18:21,127] INFO: Rank 0: epoch=16 / 100 train_loss=18.0729 valid_loss=18.1030 stale=0 time=286.0m eta=24047.7m
[2024-03-28 04:18:21,141] INFO: Initiating epoch #17 train run on device rank=0
[2024-03-28 04:18:21,141] INFO: Initiating epoch #17 train run on device rank=0
[2024-03-28 08:44:11,691] INFO: Initiating epoch #17 valid run on device rank=0
[2024-03-28 08:44:11,691] INFO: Initiating epoch #17 valid run on device rank=0
[2024-03-28 09:04:09,550] INFO: Rank 0: epoch=17 / 100 train_loss=18.0478 valid_loss=18.0912 stale=0 time=285.81m eta=23759.1m
[2024-03-28 09:04:09,550] INFO: Rank 0: epoch=17 / 100 train_loss=18.0478 valid_loss=18.0912 stale=0 time=285.81m eta=23759.1m
[2024-03-28 09:04:09,566] INFO: Initiating epoch #18 train run on device rank=0
[2024-03-28 09:04:09,566] INFO: Initiating epoch #18 train run on device rank=0
[2024-03-28 13:29:43,054] INFO: Initiating epoch #18 valid run on device rank=0
[2024-03-28 13:29:43,054] INFO: Initiating epoch #18 valid run on device rank=0
[2024-03-28 13:49:42,746] INFO: Rank 0: epoch=18 / 100 train_loss=18.0235 valid_loss=18.0697 stale=0 time=285.55m eta=23469.7m
[2024-03-28 13:49:42,746] INFO: Rank 0: epoch=18 / 100 train_loss=18.0235 valid_loss=18.0697 stale=0 time=285.55m eta=23469.7m
[2024-03-28 13:49:42,760] INFO: Initiating epoch #19 train run on device rank=0
[2024-03-28 13:49:42,760] INFO: Initiating epoch #19 train run on device rank=0
[2024-03-28 18:15:26,865] INFO: Initiating epoch #19 valid run on device rank=0
[2024-03-28 18:15:26,865] INFO: Initiating epoch #19 valid run on device rank=0
[2024-03-28 18:35:24,119] INFO: Rank 0: epoch=19 / 100 train_loss=18.0008 valid_loss=18.0532 stale=0 time=285.69m eta=23181.2m
[2024-03-28 18:35:24,119] INFO: Rank 0: epoch=19 / 100 train_loss=18.0008 valid_loss=18.0532 stale=0 time=285.69m eta=23181.2m
[2024-03-28 18:35:24,137] INFO: Initiating epoch #20 train run on device rank=0
[2024-03-28 18:35:24,137] INFO: Initiating epoch #20 train run on device rank=0
[2024-03-28 23:00:36,705] INFO: Initiating epoch #20 valid run on device rank=0
[2024-03-28 23:00:36,705] INFO: Initiating epoch #20 valid run on device rank=0
[2024-03-28 23:20:35,806] INFO: Rank 0: epoch=20 / 100 train_loss=17.9783 valid_loss=18.0347 stale=0 time=285.19m eta=22891.1m
[2024-03-28 23:20:35,806] INFO: Rank 0: epoch=20 / 100 train_loss=17.9783 valid_loss=18.0347 stale=0 time=285.19m eta=22891.1m
[2024-03-28 23:20:35,825] INFO: Initiating epoch #21 train run on device rank=0
[2024-03-28 23:20:35,825] INFO: Initiating epoch #21 train run on device rank=0
[2024-03-29 03:46:25,188] INFO: Initiating epoch #21 valid run on device rank=0
[2024-03-29 03:46:25,188] INFO: Initiating epoch #21 valid run on device rank=0
[2024-03-29 04:06:24,286] INFO: Rank 0: epoch=21 / 100 train_loss=17.9565 valid_loss=18.0197 stale=0 time=285.81m eta=22603.7m
[2024-03-29 04:06:24,286] INFO: Rank 0: epoch=21 / 100 train_loss=17.9565 valid_loss=18.0197 stale=0 time=285.81m eta=22603.7m
[2024-03-29 04:06:24,301] INFO: Initiating epoch #22 train run on device rank=0
[2024-03-29 04:06:24,301] INFO: Initiating epoch #22 train run on device rank=0
[2024-03-29 08:31:11,736] INFO: Initiating epoch #22 valid run on device rank=0
[2024-03-29 08:31:11,736] INFO: Initiating epoch #22 valid run on device rank=0
[2024-03-29 08:51:08,443] INFO: Rank 0: epoch=22 / 100 train_loss=17.9356 valid_loss=18.0009 stale=0 time=284.74m eta=22312.7m
[2024-03-29 08:51:08,443] INFO: Rank 0: epoch=22 / 100 train_loss=17.9356 valid_loss=18.0009 stale=0 time=284.74m eta=22312.7m
[2024-03-29 08:51:08,458] INFO: Initiating epoch #23 train run on device rank=0
[2024-03-29 08:51:08,458] INFO: Initiating epoch #23 train run on device rank=0
[2024-03-29 13:16:30,304] INFO: Initiating epoch #23 valid run on device rank=0
[2024-03-29 13:16:30,304] INFO: Initiating epoch #23 valid run on device rank=0
[2024-03-29 13:36:29,623] INFO: Rank 0: epoch=23 / 100 train_loss=17.9150 valid_loss=17.9919 stale=0 time=285.35m eta=22024.2m
[2024-03-29 13:36:29,623] INFO: Rank 0: epoch=23 / 100 train_loss=17.9150 valid_loss=17.9919 stale=0 time=285.35m eta=22024.2m
[2024-03-29 13:36:29,637] INFO: Initiating epoch #24 train run on device rank=0
[2024-03-29 13:36:29,637] INFO: Initiating epoch #24 train run on device rank=0
[2024-03-29 18:01:59,324] INFO: Initiating epoch #24 valid run on device rank=0
[2024-03-29 18:01:59,324] INFO: Initiating epoch #24 valid run on device rank=0
[2024-03-29 18:21:58,557] INFO: Rank 0: epoch=24 / 100 train_loss=17.8948 valid_loss=17.9806 stale=0 time=285.48m eta=21736.5m
[2024-03-29 18:21:58,557] INFO: Rank 0: epoch=24 / 100 train_loss=17.8948 valid_loss=17.9806 stale=0 time=285.48m eta=21736.5m
[2024-03-29 18:21:58,573] INFO: Initiating epoch #25 train run on device rank=0
[2024-03-29 18:21:58,573] INFO: Initiating epoch #25 train run on device rank=0
[2024-03-29 22:47:04,103] INFO: Initiating epoch #25 valid run on device rank=0
[2024-03-29 22:47:04,103] INFO: Initiating epoch #25 valid run on device rank=0
[2024-03-29 23:06:58,509] INFO: Rank 0: epoch=25 / 100 train_loss=17.8745 valid_loss=17.9677 stale=0 time=285.0m eta=21447.4m
[2024-03-29 23:06:58,509] INFO: Rank 0: epoch=25 / 100 train_loss=17.8745 valid_loss=17.9677 stale=0 time=285.0m eta=21447.4m
[2024-03-29 23:06:58,528] INFO: Initiating epoch #26 train run on device rank=0
[2024-03-29 23:06:58,528] INFO: Initiating epoch #26 train run on device rank=0
[2024-03-30 03:32:08,173] INFO: Initiating epoch #26 valid run on device rank=0
[2024-03-30 03:32:08,173] INFO: Initiating epoch #26 valid run on device rank=0
[2024-03-30 03:52:08,592] INFO: Rank 0: epoch=26 / 100 train_loss=17.8549 valid_loss=17.9569 stale=0 time=285.17m eta=21159.2m
[2024-03-30 03:52:08,592] INFO: Rank 0: epoch=26 / 100 train_loss=17.8549 valid_loss=17.9569 stale=0 time=285.17m eta=21159.2m
[2024-03-30 03:52:08,608] INFO: Initiating epoch #27 train run on device rank=0
[2024-03-30 03:52:08,608] INFO: Initiating epoch #27 train run on device rank=0
[2024-03-30 08:17:59,526] INFO: Initiating epoch #27 valid run on device rank=0
[2024-03-30 08:17:59,526] INFO: Initiating epoch #27 valid run on device rank=0
[2024-03-30 08:38:00,700] INFO: Rank 0: epoch=27 / 100 train_loss=17.8347 valid_loss=17.9366 stale=0 time=285.87m eta=20873.1m
[2024-03-30 08:38:00,700] INFO: Rank 0: epoch=27 / 100 train_loss=17.8347 valid_loss=17.9366 stale=0 time=285.87m eta=20873.1m
[2024-03-30 08:38:00,714] INFO: Initiating epoch #28 train run on device rank=0
[2024-03-30 08:38:00,714] INFO: Initiating epoch #28 train run on device rank=0
[2024-03-30 13:03:33,057] INFO: Initiating epoch #28 valid run on device rank=0
[2024-03-30 13:03:33,057] INFO: Initiating epoch #28 valid run on device rank=0
[2024-03-30 13:23:29,411] INFO: Rank 0: epoch=28 / 100 train_loss=17.8149 valid_loss=17.9264 stale=0 time=285.48m eta=20586.0m
[2024-03-30 13:23:29,411] INFO: Rank 0: epoch=28 / 100 train_loss=17.8149 valid_loss=17.9264 stale=0 time=285.48m eta=20586.0m
[2024-03-30 13:23:29,424] INFO: Initiating epoch #29 train run on device rank=0
[2024-03-30 13:23:29,424] INFO: Initiating epoch #29 train run on device rank=0
[2024-03-30 17:49:27,995] INFO: Initiating epoch #29 valid run on device rank=0
[2024-03-30 17:49:27,995] INFO: Initiating epoch #29 valid run on device rank=0
[2024-03-30 18:09:26,968] INFO: Rank 0: epoch=29 / 100 train_loss=17.7950 valid_loss=17.9112 stale=0 time=285.96m eta=20300.2m
[2024-03-30 18:09:26,968] INFO: Rank 0: epoch=29 / 100 train_loss=17.7950 valid_loss=17.9112 stale=0 time=285.96m eta=20300.2m
[2024-03-30 18:09:26,982] INFO: Initiating epoch #30 train run on device rank=0
[2024-03-30 18:09:26,982] INFO: Initiating epoch #30 train run on device rank=0
[2024-03-30 22:34:51,876] INFO: Initiating epoch #30 valid run on device rank=0
[2024-03-30 22:34:51,876] INFO: Initiating epoch #30 valid run on device rank=0
[2024-03-30 22:54:53,145] INFO: Rank 0: epoch=30 / 100 train_loss=17.7757 valid_loss=17.9006 stale=0 time=285.44m eta=20013.1m
[2024-03-30 22:54:53,145] INFO: Rank 0: epoch=30 / 100 train_loss=17.7757 valid_loss=17.9006 stale=0 time=285.44m eta=20013.1m
[2024-03-30 22:54:53,159] INFO: Initiating epoch #31 train run on device rank=0
[2024-03-30 22:54:53,159] INFO: Initiating epoch #31 train run on device rank=0
[2024-03-31 04:20:44,340] INFO: Initiating epoch #31 valid run on device rank=0
[2024-03-31 04:20:44,340] INFO: Initiating epoch #31 valid run on device rank=0
[2024-03-31 04:40:42,326] INFO: Rank 0: epoch=31 / 100 train_loss=17.7565 valid_loss=17.8932 stale=0 time=285.82m eta=19727.0m
[2024-03-31 04:40:42,326] INFO: Rank 0: epoch=31 / 100 train_loss=17.7565 valid_loss=17.8932 stale=0 time=285.82m eta=19727.0m
[2024-03-31 04:40:42,350] INFO: Initiating epoch #32 train run on device rank=0
[2024-03-31 04:40:42,350] INFO: Initiating epoch #32 train run on device rank=0
[2024-03-31 09:06:05,665] INFO: Initiating epoch #32 valid run on device rank=0
[2024-03-31 09:06:05,665] INFO: Initiating epoch #32 valid run on device rank=0
[2024-03-31 09:26:03,353] INFO: Rank 0: epoch=32 / 100 train_loss=17.7375 valid_loss=17.8774 stale=0 time=285.35m eta=19440.0m
[2024-03-31 09:26:03,353] INFO: Rank 0: epoch=32 / 100 train_loss=17.7375 valid_loss=17.8774 stale=0 time=285.35m eta=19440.0m
[2024-03-31 09:26:03,368] INFO: Initiating epoch #33 train run on device rank=0
[2024-03-31 09:26:03,368] INFO: Initiating epoch #33 train run on device rank=0