micro_batch_size: 4
Browse files
- README.md +25 -0
- scripts/pretrain-core-model.yaml +2 -1
README.md
CHANGED
@@ -65,6 +65,31 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
|
|
65 |
```
|
66 |
|
67 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
# ...
|
69 |
```
|
70 |
|
|
|
65 |
```
|
66 |
|
67 |
```
|
68 |
+
Seed set to 23
|
69 |
+
Time to instantiate model: 0.23 seconds.
|
70 |
+
Total parameters: 226,165,248
|
71 |
+
Verifying settings ...
|
72 |
+
Measured TFLOPs: 7111.07
|
73 |
+
Epoch 1 | iter 256 step 1 | loss train: 10.531, val: n/a | iter time: 3552.77 ms (step) remaining time: 4 days, 2:53:44
|
74 |
+
Epoch 1 | iter 512 step 2 | loss train: 10.517, val: n/a | iter time: 759.61 ms (step) remaining time: 3 days, 21:53:33
|
75 |
+
Epoch 1 | iter 768 step 3 | loss train: 10.478, val: n/a | iter time: 758.59 ms (step) remaining time: 3 days, 20:06:10
|
76 |
+
Epoch 1 | iter 1024 step 4 | loss train: 10.432, val: n/a | iter time: 758.46 ms (step) remaining time: 3 days, 19:11:21
|
77 |
+
Epoch 1 | iter 1280 step 5 | loss train: 10.317, val: n/a | iter time: 757.80 ms (step) remaining time: 3 days, 18:37:07
|
78 |
+
Epoch 1 | iter 1536 step 6 | loss train: 10.203, val: n/a | iter time: 757.94 ms (step) remaining time: 3 days, 18:13:14
|
79 |
+
Epoch 1 | iter 1792 step 7 | loss train: 10.092, val: n/a | iter time: 758.36 ms (step) remaining time: 3 days, 17:55:18
|
80 |
+
Epoch 1 | iter 2048 step 8 | loss train: 9.999, val: n/a | iter time: 758.86 ms (step) remaining time: 3 days, 17:41:21
|
81 |
+
Epoch 1 | iter 2304 step 9 | loss train: 9.811, val: n/a | iter time: 756.62 ms (step) remaining time: 3 days, 17:29:46
|
82 |
+
Epoch 1 | iter 2560 step 10 | loss train: 9.700, val: n/a | iter time: 756.86 ms (step) remaining time: 3 days, 17:18:59
|
83 |
+
Epoch 1 | iter 2816 step 11 | loss train: 9.546, val: n/a | iter time: 757.33 ms (step) remaining time: 3 days, 17:09:34
|
84 |
+
Epoch 1 | iter 3072 step 12 | loss train: 9.437, val: n/a | iter time: 756.18 ms (step) remaining time: 3 days, 17:01:19
|
85 |
+
Epoch 1 | iter 3328 step 13 | loss train: 9.336, val: n/a | iter time: 759.60 ms (step) remaining time: 3 days, 16:53:49
|
86 |
+
Epoch 1 | iter 3584 step 14 | loss train: 9.240, val: n/a | iter time: 758.52 ms (step) remaining time: 3 days, 16:46:55
|
87 |
+
Epoch 1 | iter 3840 step 15 | loss train: 9.120, val: n/a | iter time: 754.31 ms (step) remaining time: 3 days, 16:40:23
|
88 |
+
Epoch 1 | iter 4096 step 16 | loss train: 9.016, val: n/a | iter time: 757.21 ms (step) remaining time: 3 days, 16:34:19
|
89 |
+
Epoch 1 | iter 4352 step 17 | loss train: 8.913, val: n/a | iter time: 754.89 ms (step) remaining time: 3 days, 16:28:34
|
90 |
+
Epoch 1 | iter 4608 step 18 | loss train: 8.854, val: n/a | iter time: 756.99 ms (step) remaining time: 3 days, 16:23:07
|
91 |
+
Epoch 1 | iter 4864 step 19 | loss train: 8.798, val: n/a | iter time: 756.30 ms (step) remaining time: 3 days, 16:17:59
|
92 |
+
Epoch 1 | iter 5120 step 20 | loss train: 8.726, val: n/a | iter time: 756.11 ms (step) remaining time: 3 days, 16:13:04
|
93 |
# ...
|
94 |
```
|
95 |
|
scripts/pretrain-core-model.yaml
CHANGED
@@ -67,7 +67,8 @@ train:
|
|
67 |
# global_batch_size: 256
|
68 |
|
69 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
70 |
-
micro_batch_size: 2
|
|
|
71 |
# micro_batch_size: 1
|
72 |
|
73 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
|
|
67 |
# global_batch_size: 256
|
68 |
|
69 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
70 |
+
micro_batch_size: 4
|
71 |
+
# micro_batch_size: 2
|
72 |
# micro_batch_size: 1
|
73 |
|
74 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|