mtasic85 committed on
Commit
d4df749
1 Parent(s): 59615b0

pretrain dataset

Browse files
Files changed (1) hide show
  1. scripts/prepare_pretrain_dataset.py +12 -12
scripts/prepare_pretrain_dataset.py CHANGED
@@ -87,13 +87,13 @@ datasets_configs = [
87
  #
88
  # 2.89 GB, 430,000, English September of 2017
89
  *[
90
- {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['summary']}
91
- for i in range(0, 100, 10)
92
  ],
93
  # 3.18 GB, 1,010,500
94
  *[
95
- {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
96
- for i in range(0, 100, 10)
97
  ],
98
 
99
  #
@@ -106,18 +106,18 @@ datasets_configs = [
106
  #
107
  # 12.2 MB, 500,000
108
  [
109
- {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 10}%]+test', 'format': '{instruction} = {output}'}
110
- for i in range(0, 100, 10)
111
  ],
112
  # 125 MB, 1,000,000
113
  [
114
- {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 10}%]', 'format': '{expression} = {result}'}
115
- for i in range(0, 100, 10)
116
  ],
117
  # 3.49 GB, 22,259,474
118
  [
119
- {'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 10}%]+validation+test', 'format': '{instruction} . {output}'}
120
- for i in range(0, 100, 10)
121
  ],
122
  # 9.05 GB, 2,583,257
123
  [
@@ -130,8 +130,8 @@ datasets_configs = [
130
  #
131
  # 1.52 GB, 2,101,279
132
  [
133
- {'path': 'milkshake721/2.1M-wiki-STEM', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
134
- for i in range(0, 100, 10)
135
  ],
136
 
137
  #
 
87
  #
88
  # 2.89 GB, 430,000, English September of 2017
89
  *[
90
+ {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
91
+ for i in range(0, 100, 20)
92
  ],
93
  # 3.18 GB, 1,010,500
94
  *[
95
+ {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
96
+ for i in range(0, 100, 20)
97
  ],
98
 
99
  #
 
106
  #
107
  # 12.2 MB, 500,000
108
  [
109
+ {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]+test', 'format': '{instruction} = {output}'}
110
+ for i in range(0, 100, 20)
111
  ],
112
  # 125 MB, 1,000,000
113
  [
114
+ {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]', 'format': '{expression} = {result}'}
115
+ for i in range(0, 100, 20)
116
  ],
117
  # 3.49 GB, 22,259,474
118
  [
119
+ {'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 20}%]+validation+test', 'format': '{instruction} . {output}'}
120
+ for i in range(0, 100, 20)
121
  ],
122
  # 9.05 GB, 2,583,257
123
  [
 
130
  #
131
  # 1.52 GB, 2,101,279
132
  [
133
+ {'path': 'milkshake721/2.1M-wiki-STEM', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
134
+ for i in range(0, 100, 20)
135
  ],
136
 
137
  #