pretrain dataset
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -186,6 +186,7 @@ outputs = optimize(
|
|
186 |
# Number of tokens to store in each chunk; this is roughly 64MB of tokens per chunk.
|
187 |
chunk_size=(2049 * 8012),
|
188 |
num_workers=32,
|
|
|
189 |
)
|
190 |
|
191 |
#
|
|
|
186 |
# Number of tokens to store in each chunk; this is roughly 64MB of tokens per chunk.
|
187 |
chunk_size=(2049 * 8012),
|
188 |
num_workers=32,
|
189 |
+
reorder_files=False,
|
190 |
)
|
191 |
|
192 |
#
|