contrain dataset
Browse files
scripts/prepare_contrain_dataset.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
-
from typing import Optional, Union, Callable, Iterator
|
2 |
from collections.abc import Collection
|
3 |
from functools import partial
|
4 |
|
5 |
-
import numpy as np
|
6 |
from datasets import load_dataset
|
7 |
from litdata import optimize, TokensLoader
|
8 |
from litgpt.tokenizer import Tokenizer
|
|
|
9 |
|
10 |
|
11 |
def batch_dict_iterator(path: Optional[str]=None,
|
@@ -65,8 +65,6 @@ def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer
|
|
65 |
assert isinstance(dataset_config, (dict, list))
|
66 |
|
67 |
for text in batch_iterator(dataset_config):
|
68 |
-
# print(text)
|
69 |
-
# break
|
70 |
text_ids = tokenizer.encode(text, bos=False, eos=True)
|
71 |
yield text_ids
|
72 |
|
@@ -717,7 +715,7 @@ datasets_configs = [
|
|
717 |
{'role': 'user', 'content': r['prompt']},
|
718 |
{'role': 'assistant', 'content': r['response']},
|
719 |
]}, # 12.4 MB, 3,000
|
720 |
-
{'path': 'dvilasuero/
|
721 |
{'role': 'system', 'content': r['system']},
|
722 |
{'role': 'user', 'content': r['prompt']},
|
723 |
{'role': 'assistant', 'content': r['response']},
|
@@ -739,8 +737,17 @@ outputs = optimize(
|
|
739 |
inputs=datasets_configs,
|
740 |
output_dir='../contrain-data/',
|
741 |
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
|
742 |
-
# chunk_size=(2049 * 8012),
|
743 |
chunk_size=(8193 * 2003),
|
744 |
num_workers=32,
|
745 |
# compression='zstd',
|
746 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Union, Callable, Iterator
|
2 |
from collections.abc import Collection
|
3 |
from functools import partial
|
4 |
|
|
|
5 |
from datasets import load_dataset
|
6 |
from litdata import optimize, TokensLoader
|
7 |
from litgpt.tokenizer import Tokenizer
|
8 |
+
from litdata import StreamingDataset
|
9 |
|
10 |
|
11 |
def batch_dict_iterator(path: Optional[str]=None,
|
|
|
65 |
assert isinstance(dataset_config, (dict, list))
|
66 |
|
67 |
for text in batch_iterator(dataset_config):
|
|
|
|
|
68 |
text_ids = tokenizer.encode(text, bos=False, eos=True)
|
69 |
yield text_ids
|
70 |
|
|
|
715 |
{'role': 'user', 'content': r['prompt']},
|
716 |
{'role': 'assistant', 'content': r['response']},
|
717 |
]}, # 12.4 MB, 3,000
|
718 |
+
{'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
|
719 |
{'role': 'system', 'content': r['system']},
|
720 |
{'role': 'user', 'content': r['prompt']},
|
721 |
{'role': 'assistant', 'content': r['response']},
|
|
|
737 |
inputs=datasets_configs,
|
738 |
output_dir='../contrain-data/',
|
739 |
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
|
|
|
740 |
chunk_size=(8193 * 2003),
|
741 |
num_workers=32,
|
742 |
# compression='zstd',
|
743 |
)
|
744 |
+
|
745 |
+
#
|
746 |
+
# total number of token blocks (len of the StreamingDataset with block_size=8193)
|
747 |
+
#
|
748 |
+
dataset = StreamingDataset(
|
749 |
+
input_dir='../contrain-data/',
|
750 |
+
item_loader=TokensLoader(block_size=8193),
|
751 |
+
)
|
752 |
+
|
753 |
+
print(len(dataset))
|
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import Optional, Union
|
2 |
from functools import partial
|
3 |
|
4 |
from datasets import load_dataset
|
@@ -15,7 +15,7 @@ def batch_dict_iterator(path: str,
|
|
15 |
revision: Optional[str]=None,
|
16 |
split: str='train',
|
17 |
num_proc: Optional[int]=None,
|
18 |
-
format: Optional[str]=None):
|
19 |
assert isinstance(format, str) or callable(format)
|
20 |
|
21 |
dataset = load_dataset(path=path,
|
@@ -86,12 +86,12 @@ datasets_configs = [
|
|
86 |
# general knowledge
|
87 |
#
|
88 |
# 2.89 GB, 430,000, English September of 2017
|
89 |
-
|
90 |
{'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
|
91 |
for i in range(0, 100, 20)
|
92 |
],
|
93 |
# 3.18 GB, 1,010,500
|
94 |
-
|
95 |
{'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
|
96 |
for i in range(0, 100, 20)
|
97 |
],
|
|
|
1 |
+
from typing import Optional, Union, Iterator
|
2 |
from functools import partial
|
3 |
|
4 |
from datasets import load_dataset
|
|
|
15 |
revision: Optional[str]=None,
|
16 |
split: str='train',
|
17 |
num_proc: Optional[int]=None,
|
18 |
+
format: Optional[str]=None) -> Iterator[str]:
|
19 |
assert isinstance(format, str) or callable(format)
|
20 |
|
21 |
dataset = load_dataset(path=path,
|
|
|
86 |
# general knowledge
|
87 |
#
|
88 |
# 2.89 GB, 430,000, English September of 2017
|
89 |
+
[
|
90 |
{'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
|
91 |
for i in range(0, 100, 20)
|
92 |
],
|
93 |
# 3.18 GB, 1,010,500
|
94 |
+
[
|
95 |
{'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
|
96 |
for i in range(0, 100, 20)
|
97 |
],
|