bigscience-bot
commited on
Commit
•
5f100e1
1
Parent(s):
b98ce8e
new data
Browse files- logs/main_log.txt +64 -0
logs/main_log.txt
CHANGED
@@ -76938,3 +76938,67 @@ time (ms)
|
|
76938 |
time (ms)
|
76939 |
iteration 550/ 292968 | consumed samples: 1126400 | consumed tokens: 84869120 | elapsed time per iteration (ms): 112519.9 | learning rate: 3.004E-05 | global batch size: 2048 | lm loss: 5.550436E+00 | loss scale: 8192.0 | grad norm: 10479.605 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76940 |
time (ms)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76938 |
time (ms)
|
76939 |
iteration 550/ 292968 | consumed samples: 1126400 | consumed tokens: 84869120 | elapsed time per iteration (ms): 112519.9 | learning rate: 3.004E-05 | global batch size: 2048 | lm loss: 5.550436E+00 | loss scale: 8192.0 | grad norm: 10479.605 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76940 |
time (ms)
|
76941 |
+
iteration 551/ 292968 | consumed samples: 1128448 | consumed tokens: 85049344 | elapsed time per iteration (ms): 111692.6 | learning rate: 3.009E-05 | global batch size: 2048 | lm loss: 5.542852E+00 | loss scale: 8192.0 | grad norm: 18511.064 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76942 |
+
time (ms)
|
76943 |
+
iteration 552/ 292968 | consumed samples: 1130496 | consumed tokens: 85229568 | elapsed time per iteration (ms): 111626.8 | learning rate: 3.015E-05 | global batch size: 2048 | lm loss: 5.529922E+00 | loss scale: 8192.0 | grad norm: 9669.866 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76944 |
+
time (ms)
|
76945 |
+
iteration 553/ 292968 | consumed samples: 1132544 | consumed tokens: 85409792 | elapsed time per iteration (ms): 112073.7 | learning rate: 3.020E-05 | global batch size: 2048 | lm loss: 5.545301E+00 | loss scale: 8192.0 | grad norm: 12652.392 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76946 |
+
time (ms)
|
76947 |
+
iteration 554/ 292968 | consumed samples: 1134592 | consumed tokens: 85590016 | elapsed time per iteration (ms): 111561.5 | learning rate: 3.026E-05 | global batch size: 2048 | lm loss: 5.548908E+00 | loss scale: 8192.0 | grad norm: 12234.313 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76948 |
+
time (ms)
|
76949 |
+
iteration 555/ 292968 | consumed samples: 1136640 | consumed tokens: 85770240 | elapsed time per iteration (ms): 111056.4 | learning rate: 3.031E-05 | global batch size: 2048 | lm loss: 5.538098E+00 | loss scale: 8192.0 | grad norm: 12248.211 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76950 |
+
time (ms)
|
76951 |
+
iteration 556/ 292968 | consumed samples: 1138688 | consumed tokens: 85950464 | elapsed time per iteration (ms): 111595.6 | learning rate: 3.037E-05 | global batch size: 2048 | lm loss: 5.537742E+00 | loss scale: 8192.0 | grad norm: 10560.271 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76952 |
+
time (ms)
|
76953 |
+
iteration 557/ 292968 | consumed samples: 1140736 | consumed tokens: 86130688 | elapsed time per iteration (ms): 113191.6 | learning rate: 3.042E-05 | global batch size: 2048 | lm loss: 5.517148E+00 | loss scale: 8192.0 | grad norm: 14233.138 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76954 |
+
time (ms)
|
76955 |
+
iteration 558/ 292968 | consumed samples: 1142784 | consumed tokens: 86310912 | elapsed time per iteration (ms): 112335.2 | learning rate: 3.047E-05 | global batch size: 2048 | lm loss: 5.566739E+00 | loss scale: 8192.0 | grad norm: 14225.350 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76956 |
+
time (ms)
|
76957 |
+
iteration 559/ 292968 | consumed samples: 1144832 | consumed tokens: 86491136 | elapsed time per iteration (ms): 113204.0 | learning rate: 3.053E-05 | global batch size: 2048 | lm loss: 5.529708E+00 | loss scale: 8192.0 | grad norm: 9114.316 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76958 |
+
time (ms)
|
76959 |
+
iteration 560/ 292968 | consumed samples: 1146880 | consumed tokens: 86671360 | elapsed time per iteration (ms): 111793.0 | learning rate: 3.058E-05 | global batch size: 2048 | lm loss: 5.541924E+00 | loss scale: 8192.0 | grad norm: 9695.972 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76960 |
+
time (ms)
|
76961 |
+
iteration 561/ 292968 | consumed samples: 1148928 | consumed tokens: 86851584 | elapsed time per iteration (ms): 113028.7 | learning rate: 3.064E-05 | global batch size: 2048 | lm loss: 5.521393E+00 | loss scale: 8192.0 | grad norm: 11158.709 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76962 |
+
time (ms)
|
76963 |
+
iteration 562/ 292968 | consumed samples: 1150976 | consumed tokens: 87031808 | elapsed time per iteration (ms): 111623.4 | learning rate: 3.069E-05 | global batch size: 2048 | lm loss: 5.501397E+00 | loss scale: 8192.0 | grad norm: 11525.341 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76964 |
+
time (ms)
|
76965 |
+
iteration 563/ 292968 | consumed samples: 1153024 | consumed tokens: 87212032 | elapsed time per iteration (ms): 110973.8 | learning rate: 3.075E-05 | global batch size: 2048 | lm loss: 5.487821E+00 | loss scale: 8192.0 | grad norm: 12021.366 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76966 |
+
time (ms)
|
76967 |
+
iteration 564/ 292968 | consumed samples: 1155072 | consumed tokens: 87392256 | elapsed time per iteration (ms): 113374.4 | learning rate: 3.080E-05 | global batch size: 2048 | lm loss: 5.480217E+00 | loss scale: 8192.0 | grad norm: 10903.562 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76968 |
+
time (ms)
|
76969 |
+
iteration 565/ 292968 | consumed samples: 1157120 | consumed tokens: 87572480 | elapsed time per iteration (ms): 112996.4 | learning rate: 3.086E-05 | global batch size: 2048 | lm loss: 5.499344E+00 | loss scale: 8192.0 | grad norm: 10305.931 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76970 |
+
time (ms)
|
76971 |
+
iteration 566/ 292968 | consumed samples: 1159168 | consumed tokens: 87752704 | elapsed time per iteration (ms): 112129.1 | learning rate: 3.091E-05 | global batch size: 2048 | lm loss: 5.520879E+00 | loss scale: 8192.0 | grad norm: 12505.504 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76972 |
+
time (ms)
|
76973 |
+
iteration 567/ 292968 | consumed samples: 1161216 | consumed tokens: 87932928 | elapsed time per iteration (ms): 112661.0 | learning rate: 3.097E-05 | global batch size: 2048 | lm loss: 5.531937E+00 | loss scale: 8192.0 | grad norm: 14944.754 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76974 |
+
time (ms)
|
76975 |
+
iteration 568/ 292968 | consumed samples: 1163264 | consumed tokens: 88113152 | elapsed time per iteration (ms): 113956.6 | learning rate: 3.102E-05 | global batch size: 2048 | lm loss: 5.497797E+00 | loss scale: 8192.0 | grad norm: 11478.429 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76976 |
+
time (ms)
|
76977 |
+
iteration 569/ 292968 | consumed samples: 1165312 | consumed tokens: 88293376 | elapsed time per iteration (ms): 112649.6 | learning rate: 3.107E-05 | global batch size: 2048 | lm loss: 5.505655E+00 | loss scale: 8192.0 | grad norm: 13474.430 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76978 |
+
time (ms)
|
76979 |
+
iteration 570/ 292968 | consumed samples: 1167360 | consumed tokens: 88473600 | elapsed time per iteration (ms): 111252.0 | learning rate: 3.113E-05 | global batch size: 2048 | lm loss: 5.493463E+00 | loss scale: 8192.0 | grad norm: 14819.370 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76980 |
+
time (ms)
|
76981 |
+
iteration 571/ 292968 | consumed samples: 1169408 | consumed tokens: 88653824 | elapsed time per iteration (ms): 112373.6 | learning rate: 3.118E-05 | global batch size: 2048 | lm loss: 5.485642E+00 | loss scale: 8192.0 | grad norm: 7874.211 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76982 |
+
time (ms)
|
76983 |
+
iteration 572/ 292968 | consumed samples: 1171456 | consumed tokens: 88834048 | elapsed time per iteration (ms): 112530.1 | learning rate: 3.124E-05 | global batch size: 2048 | lm loss: 5.480896E+00 | loss scale: 8192.0 | grad norm: 14748.807 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76984 |
+
time (ms)
|
76985 |
+
iteration 573/ 292968 | consumed samples: 1173504 | consumed tokens: 89014272 | elapsed time per iteration (ms): 111003.6 | learning rate: 3.129E-05 | global batch size: 2048 | lm loss: 5.495447E+00 | loss scale: 8192.0 | grad norm: 11089.801 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76986 |
+
time (ms)
|
76987 |
+
iteration 574/ 292968 | consumed samples: 1175552 | consumed tokens: 89194496 | elapsed time per iteration (ms): 112117.2 | learning rate: 3.135E-05 | global batch size: 2048 | lm loss: 5.516068E+00 | loss scale: 8192.0 | grad norm: 15890.094 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76988 |
+
time (ms)
|
76989 |
+
iteration 575/ 292968 | consumed samples: 1177600 | consumed tokens: 89374720 | elapsed time per iteration (ms): 113068.6 | learning rate: 3.140E-05 | global batch size: 2048 | lm loss: 5.471289E+00 | loss scale: 8192.0 | grad norm: 10932.631 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76990 |
+
time (ms)
|
76991 |
+
iteration 576/ 292968 | consumed samples: 1179648 | consumed tokens: 89554944 | elapsed time per iteration (ms): 111584.4 | learning rate: 3.146E-05 | global batch size: 2048 | lm loss: 5.460034E+00 | loss scale: 8192.0 | grad norm: 14436.227 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76992 |
+
time (ms)
|
76993 |
+
iteration 577/ 292968 | consumed samples: 1181696 | consumed tokens: 89735168 | elapsed time per iteration (ms): 113415.4 | learning rate: 3.151E-05 | global batch size: 2048 | lm loss: 5.467341E+00 | loss scale: 8192.0 | grad norm: 9677.502 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76994 |
+
time (ms)
|
76995 |
+
iteration 578/ 292968 | consumed samples: 1183744 | consumed tokens: 89915392 | elapsed time per iteration (ms): 112958.8 | learning rate: 3.157E-05 | global batch size: 2048 | lm loss: 5.456917E+00 | loss scale: 8192.0 | grad norm: 16119.399 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76996 |
+
time (ms)
|
76997 |
+
iteration 579/ 292968 | consumed samples: 1185792 | consumed tokens: 90095616 | elapsed time per iteration (ms): 111312.2 | learning rate: 3.162E-05 | global batch size: 2048 | lm loss: 5.460016E+00 | loss scale: 8192.0 | grad norm: 12161.697 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
76998 |
+
time (ms)
|
76999 |
+
iteration 580/ 292968 | consumed samples: 1187840 | consumed tokens: 90275840 | elapsed time per iteration (ms): 112441.1 | learning rate: 3.168E-05 | global batch size: 2048 | lm loss: 5.463281E+00 | loss scale: 8192.0 | grad norm: 12047.781 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
77000 |
+
time (ms)
|
77001 |
+
iteration 581/ 292968 | consumed samples: 1189888 | consumed tokens: 90472448 | elapsed time per iteration (ms): 110471.3 | learning rate: 3.173E-05 | global batch size: 2048 | lm loss: 5.491323E+00 | loss scale: 8192.0 | grad norm: 11849.322 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
77002 |
+
time (ms)
|
77003 |
+
iteration 582/ 292968 | consumed samples: 1191936 | consumed tokens: 90669056 | elapsed time per iteration (ms): 108157.8 | learning rate: 3.178E-05 | global batch size: 2048 | lm loss: 5.475502E+00 | loss scale: 8192.0 | grad norm: 10832.692 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
77004 |
+
time (ms)
|