mtasic85 committed on
Commit
d799763
1 Parent(s): 70e7514

pretrain dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -186,6 +186,7 @@ outputs = optimize(
186
  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
187
  chunk_size=(2049 * 8012),
188
  num_workers=32,
 
189
  )
190
 
191
  #
 
186
  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
187
  chunk_size=(2049 * 8012),
188
  num_workers=32,
189
+ reorder_files=False,
190
  )
191
 
192
  #