pretrain dataset
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -87,13 +87,13 @@ datasets_configs = [
|
|
87 |
#
|
88 |
# 2.89 GB, 430,000, English September of 2017
|
89 |
*[
|
90 |
-
{'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i +
|
91 |
-
for i in range(0, 100,
|
92 |
],
|
93 |
# 3.18 GB, 1,010,500
|
94 |
*[
|
95 |
-
{'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i +
|
96 |
-
for i in range(0, 100,
|
97 |
],
|
98 |
|
99 |
#
|
@@ -106,18 +106,18 @@ datasets_configs = [
|
|
106 |
#
|
107 |
# 12.2 MB, 500,000
|
108 |
[
|
109 |
-
{'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i +
|
110 |
-
for i in range(0, 100,
|
111 |
],
|
112 |
# 125 MB, 1,000,000
|
113 |
[
|
114 |
-
{'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i +
|
115 |
-
for i in range(0, 100,
|
116 |
],
|
117 |
# 3.49 GB, 22,259,474
|
118 |
[
|
119 |
-
{'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i +
|
120 |
-
for i in range(0, 100,
|
121 |
],
|
122 |
# 9.05 GB, 2,583,257
|
123 |
[
|
@@ -130,8 +130,8 @@ datasets_configs = [
|
|
130 |
#
|
131 |
# 1.52 GB, 2,101,279
|
132 |
[
|
133 |
-
{'path': 'milkshake721/2.1M-wiki-STEM', 'split': f'train[{i}%:{i +
|
134 |
-
for i in range(0, 100,
|
135 |
],
|
136 |
|
137 |
#
|
|
|
87 |
#
|
88 |
# 2.89 GB, 430,000, English September of 2017
|
89 |
*[
|
90 |
+
{'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
|
91 |
+
for i in range(0, 100, 20)
|
92 |
],
|
93 |
# 3.18 GB, 1,010,500
|
94 |
*[
|
95 |
+
{'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
|
96 |
+
for i in range(0, 100, 20)
|
97 |
],
|
98 |
|
99 |
#
|
|
|
106 |
#
|
107 |
# 12.2 MB, 500,000
|
108 |
[
|
109 |
+
{'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]+test', 'format': '{instruction} = {output}'}
|
110 |
+
for i in range(0, 100, 20)
|
111 |
],
|
112 |
# 125 MB, 1,000,000
|
113 |
[
|
114 |
+
{'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]', 'format': '{expression} = {result}'}
|
115 |
+
for i in range(0, 100, 20)
|
116 |
],
|
117 |
# 3.49 GB, 22,259,474
|
118 |
[
|
119 |
+
{'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 20}%]+validation+test', 'format': '{instruction} . {output}'}
|
120 |
+
for i in range(0, 100, 20)
|
121 |
],
|
122 |
# 9.05 GB, 2,583,257
|
123 |
[
|
|
|
130 |
#
|
131 |
# 1.52 GB, 2,101,279
|
132 |
[
|
133 |
+
{'path': 'milkshake721/2.1M-wiki-STEM', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
|
134 |
+
for i in range(0, 100, 20)
|
135 |
],
|
136 |
|
137 |
#
|