mtasic85 committed on
Commit fa41d5c
1 Parent(s): 8724fe8

contrain dataset

scripts/prepare_contrain_dataset.py CHANGED
@@ -1,11 +1,11 @@
-from typing import Optional, Union, Callable, Iterator, Any
+from typing import Optional, Union, Callable, Iterator
 from collections.abc import Collection
 from functools import partial
 
-import numpy as np
 from datasets import load_dataset
 from litdata import optimize, TokensLoader
 from litgpt.tokenizer import Tokenizer
+from litdata import StreamingDataset
 
 
 def batch_dict_iterator(path: Optional[str]=None,
@@ -65,8 +65,6 @@ def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer
     assert isinstance(dataset_config, (dict, list))
 
     for text in batch_iterator(dataset_config):
-        # print(text)
-        # break
         text_ids = tokenizer.encode(text, bos=False, eos=True)
         yield text_ids
 
@@ -717,7 +715,7 @@ datasets_configs = [
         {'role': 'user', 'content': r['prompt']},
         {'role': 'assistant', 'content': r['response']},
     ]}, # 12.4 MB, 3,000
-    {'path': 'dvilasuero/dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
+    {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
         {'role': 'system', 'content': r['system']},
         {'role': 'user', 'content': r['prompt']},
         {'role': 'assistant', 'content': r['response']},
@@ -739,8 +737,17 @@ outputs = optimize(
     inputs=datasets_configs,
     output_dir='../contrain-data/',
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
-    # chunk_size=(2049 * 8012),
     chunk_size=(8193 * 2003),
     num_workers=32,
     # compression='zstd',
 )
+
+#
+# total number of chunks
+#
+dataset = StreamingDataset(
+    input_dir='../contrain-data/',
+    item_loader=TokensLoader(block_size=8193),
+)
+
+print(len(dataset))
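
For context, a minimal read-back sketch for the optimized output above. It assumes the '../contrain-data/' directory written by optimize() and uses litdata's StreamingDataset, StreamingDataLoader and TokensLoader; the batch_size and num_workers values are illustrative only and are not part of this commit.

from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

# The commit's chunk_size is 8193 * 2003 = 16,410,579 tokens per chunk,
# which at ~4 bytes per token id is roughly the 64 MB mentioned in its comment.
dataset = StreamingDataset(
    input_dir='../contrain-data/',              # output_dir used by optimize() above
    item_loader=TokensLoader(block_size=8193),  # block size matches the 8193 factor
)

# batch_size and num_workers below are placeholder values (assumption).
dataloader = StreamingDataLoader(dataset, batch_size=4, num_workers=4)

for batch in dataloader:
    # each row is one block of 8193 token ids
    print(batch.shape)
    break
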
scripts/prepare_pretrain_dataset.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional, Union, Iterator
 from functools import partial
 
 from datasets import load_dataset
@@ -15,7 +15,7 @@ def batch_dict_iterator(path: str,
                         revision: Optional[str]=None,
                         split: str='train',
                         num_proc: Optional[int]=None,
-                        format: Optional[str]=None):
+                        format: Optional[str]=None) -> Iterator[str]:
     assert isinstance(format, str) or callable(format)
 
     dataset = load_dataset(path=path,
@@ -86,12 +86,12 @@ datasets_configs = [
     # general knowledge
     #
     # 2.89 GB, 430,000, English September of 2017
-    *[
+    [
         {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
         for i in range(0, 100, 20)
     ],
     # 3.18 GB, 1,010,500
-    *[
+    [
         {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
         for i in range(0, 100, 20)
     ],
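
As a side note on the '*[' to '[' change in datasets_configs, here is a plain-Python sketch of the difference; the variable names are illustrative only, not from the script.

# The splat spread the five 20% slices as separate top-level configs;
# the plain list keeps them grouped as a single nested entry.
slices = [
    {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]'}
    for i in range(0, 100, 20)
]

flat = [*slices]    # 5 separate config dicts at the top level
grouped = [slices]  # 1 top-level entry containing all 5 dicts

print(len(flat), len(grouped))  # 5 1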