guipenedo HF staff committed on
Commit
f7efb82
·
unverified ·
1 Parent(s): ac80290

styling and small text changes

Browse files
dist/distill.bundle.js CHANGED
The diff for this file is too large to render. See raw diff
 
dist/distill.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
dist/index.html CHANGED
@@ -1,125 +1,20 @@
1
  <!doctype html>
2
 
3
  <head>
4
- <link rel="stylesheet" href="style.css">
5
  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
6
  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
7
  <meta name="viewport" content="width=device-width, initial-scale=1">
8
  <meta charset="utf8">
9
  <base target="_blank">
10
  <title>FineWeb: decanting the web for the finest text data at scale</title>
11
- <style>
12
-
13
- /* ****************************************
14
- * TOC
15
- ******************************************/
16
- @media (max-width: 1199px) {
17
- d-contents {
18
- display: none;
19
- justify-self: start;
20
- align-self: start;
21
- padding-bottom: 0.5em;
22
- margin-bottom: 1em;
23
- padding-left: 0.25em;
24
- border-bottom: 1px solid rgba(0, 0, 0, 0.1);
25
- border-bottom-width: 1px;
26
- border-bottom-style: solid;
27
- border-bottom-color: rgba(0, 0, 0, 0.1);
28
- }
29
- }
30
-
31
- d-contents a:hover {
32
- border-bottom: none;
33
- }
34
-
35
-
36
- @media (min-width: 1200px) {
37
- d-article {
38
- /* Ensure d-article does not prevent sticky positioning */
39
- overflow: visible;
40
- }
41
-
42
- d-contents {
43
- align-self: start;
44
- grid-column-start: 1 !important;
45
- grid-column-end: 4 !important;
46
- grid-row: auto / span 6;
47
- justify-self: end;
48
- margin-top: 0em;
49
- padding-right: 3em;
50
- padding-left: 2em;
51
- border-right: 1px solid rgba(0, 0, 0, 0.1);
52
- border-right-width: 1px;
53
- border-right-style: solid;
54
- border-right-color: rgba(0, 0, 0, 0.1);
55
- position: -webkit-sticky; /* For Safari */
56
- position: sticky;
57
- top: 10px; /* Adjust this value if needed */
58
- }
59
- }
60
-
61
- d-contents nav h3 {
62
- margin-top: 0;
63
- margin-bottom: 1em;
64
- }
65
-
66
- d-contents nav div {
67
- color: rgba(0, 0, 0, 0.8);
68
- font-weight: bold;
69
- }
70
-
71
- d-contents nav a {
72
- color: rgba(0, 0, 0, 0.8);
73
- border-bottom: none;
74
- text-decoration: none;
75
- }
76
-
77
- d-contents li {
78
- list-style-type: none;
79
- }
80
-
81
- d-contents ul, d-article d-contents ul {
82
- padding-left: 1em;
83
- }
84
-
85
- d-contents nav ul li {
86
- margin-bottom: .25em;
87
- }
88
-
89
- d-contents nav a:hover {
90
- text-decoration: underline solid rgba(0, 0, 0, 0.6);
91
- }
92
-
93
- d-contents nav ul {
94
- margin-top: 0;
95
- margin-bottom: 6px;
96
- }
97
-
98
-
99
- d-contents nav > div {
100
- display: block;
101
- outline: none;
102
- margin-bottom: 0.5em;
103
- }
104
-
105
- d-contents nav > div > a {
106
- font-size: 13px;
107
- font-weight: 600;
108
- }
109
-
110
- d-contents nav > div > a:hover,
111
- d-contents nav > ul > li > a:hover {
112
- text-decoration: none;
113
- }
114
-
115
- </style>
116
  </head>
117
 
118
  <body>
119
  <d-front-matter>
120
  <script id='distill-front-matter' type="text/json">{
121
  "title": "🍷 FineWeb: decanting the web for the finest text data at scale",
122
- "description": "This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our design choices), and the process followed to create 📚 FineWeb-Edu.",
123
  "published": "May 28, 2024",
124
  "affiliation": {"name": "HuggingFace"},
125
  "authors": [
@@ -187,17 +82,18 @@
187
  <p>Both datasets are released under the permissive <a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a></p>
188
 
189
  <p>As 🍷 FineWeb has gathered a lot of interest from the
190
- community, we decided to explain in all details the steps involved in creating it as well as our processing decisions and
191
- many lessons learned along the way. Hence the present (lengthy) technical report. Read on for all the juicy details on large text dataset creation!</p>
 
192
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
193
- recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset as well.</p>
194
 
195
  <h2>General considerations on web data</h2>
196
  <h3>Sourcing the data</h3>
197
  <p>A common question often asked regarding web datasets used
198
  to train LLMs is “where do they even get all that data?”. There are generally two options:</p>
199
  <ul>
200
- <li>you either crawl it yourself, like companies like OpenAI or Anthropic (among other) do (see hints <a
201
  href="https://platform.openai.com/docs/gptbot">here</a> and <a
202
  href="https://darkvisitors.com/agents/claudebot">here</a>)
203
  </li>
@@ -206,15 +102,15 @@
206
  <li>you use a public repository of crawled webpages, like the one maintained by
207
  the non-profit <a href="https://commoncrawl.org/">CommonCrawl</a></li>
208
  </ul>
209
- <p>To build and filter 🍷 FineWeb, following what had done in the past by number of LLM training teams,
210
  we used <a href="https://commoncrawl.org/">CommonCrawl</a> (CC) as a starting point.
211
  The Common Crawl non–profit organization has been crawling the web since 2007 and
212
- release a new crawl containing 200 to 300 TB of textual content obtained via automatic web crawling usually
213
  every 1 or 2 months. </p>
214
- <p>As an example, the latest CC crawl (id 2024-18) contains 2.7
215
- billion web pages, totaling 386 TiB of uncompressed HTML text content (Note that the size changes from dump to dump).
216
- Ninety-six crawls have been released since 2013 and 3 dumps from 2008 to 2012, which are in a different (older) format.
217
- <d-footnote>We have not processed these 3 older dumps.</d-footnote> </p>
218
 
219
  <h3>Processing at scale</h3>
220
  <p>Given the sheer size of the data involved, one of the main
@@ -225,40 +121,39 @@
225
  href="https://github.com/huggingface/datatrove"><code>datatrove</code></a><d-cite bibtex-key="penedo2024datatrove"></d-cite>, an open-source data
226
  processing library that allowed us to seamlessly scale our filtering and deduplication setup to thousands of
227
  CPU cores. All the data processing steps involved in the creation of 🍷 FineWeb used this <a
228
- href="https://github.com/huggingface/datatrove">library</a>. In most cases, you'll find the exact same scripts we used in the <a
229
- href="https://github.com/huggingface/datatrove"><code>datatrove</code></a> repository.</p>
230
 
231
  <h3>What is good data?</h3>
232
  <p>This is probably the main question to keep in mind when
233
- creating a dataset. In most context and in particular in the context of large language model pretraining <d-footnote>Note that all our discussion in this report is focused on the special field of web-scale datasets ("web-scale" typically meaning >100 billion tokens) used to pretrained a Large Language Models (by pretraining we mean the very first step of training of a model, starting from random weights). We don't pretend to cover any other field of dataset creation nor that the lessons or hypothesis we develop in this document can extend to any field beside this specific field.</d-footnote>, "high quality" is not a very well defined term<d-cite bibtex-key="albalak2024survey"></d-cite>, and not even a property of documents that can always be clearly perceived through direct, human, observation alone.<d-cite bibtex-key="longpre2023pretrainers"></d-cite></p>
234
  <p>It is still common to train a model on a given corpus considered "clean"
235
  (typically wikipedia<d-footnote>Even though as we mentioned above the notion of "clean" is so ill-defined that it should probably not been seen as equivalent to wikipedia-type of text</d-footnote>) and use it to check the perplexity on the dataset
236
  that we were trying to curate<d-cite bibtex-key="wenzek2019ccnet"></d-cite>. Unfortunately this does not always correlate with improved performance on a set of downstream
237
- tasks of interest<d-cite bibtex-key="soldaini2024dolma"></d-cite>, and as a result another often used approach is to train small models<d-footnote>"Small" in comparison to standard sizes of today's LLM, i.e. small in comparison to 7-70 billion parameters. In this work "small" means about 1-2 billion parameters</d-footnote> on a representative subset of our dataset and evaluate them on
238
- a set of evaluation tasks. The reason small model are used is because training models is
239
- expensive and time consuming as a function of model size. In this second approach, it is important to
240
  choose a diverse and representative set of dataset-evaluation tasks and try not to overfit to any one individual benchmark as it would risk hurting the generality of the obtained LLM after pretraining.</p>
241
  <p>Yet another way to compare different datasets would be to
242
  train a model on each dataset and have humans rate and compare the generations of the models (like on the <a
243
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
244
  reliable results in terms of representing real model usage, but getting ablation results this way is unfortunately
245
- expensive and slow. It also often requires for the models to have undergone through an instruction finetuning stage to acquire conversational capabilities, as pretrained models are not directly-designed to follow instructions and thus much more sensitive to prompt details.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
246
  <p>In this work, we went with the approach of training small
247
  models and evaluating them on a set of "early-signal" benchmark tasks. We believe this is a reasonable proxy for the quality
248
- of the data used to train these models with the above mentioned caveat around overfitting on the evaluation benchmarks.</p>
249
  <h3>Ablations and evaluation setup</h3>
250
- <p>To be able to compare the impact of a given processing
251
- step, we typically train two models on two versions of the dataset, one version processed with the extra –ablated– step and another version with this step
252
- ablated (cut/removed). Apart from the data, these two models are otherwise identical: same number of parameters, architecture hyper-parameters, and are trained
253
  on an equal number of randomly sampled tokens from each version of the data, for a single epoch — the only difference being thus the
254
- training data. We then evaluate each model on the same set of tasks and compare average
255
  scores.</p>
256
- <p>Our ablation models are trained using <a
257
  href="https://github.com/huggingface/nanotron"><code>nanotron</code></a> with this config [<strong>TODO:
258
- INSERT SIMPLIFIED NANOTRON CONFIG HERE</strong>]. Ablation models have 1.82B parameters (including embeddings), used the Llama
259
- architecture with a 2048 sequence length, a global batch size of ~2 million tokens and GPT2 tokenizer as mentioned above. For most
260
  ablations we trained on ~28B tokens (roughly the Chinchilla<d-cite bibtex-key="hoffmann2022training"></d-cite> optimal training size for this
261
- model size). To confirm relative performance after several steps of filtering we conducted longer training runs in 350 billion tokens as mentioned further below.</p>
262
  <p>We evaluated the models using <a
263
  href="https://github.com/huggingface/lighteval/"><code>lighteval</code></a>. We carefully selected a set of benchmark for ablations by selecting
264
  benchmarks that would provide good signal at a relatively small scale ("small" models trained on only "a few
@@ -266,17 +161,17 @@
266
  <ul>
267
  <li>small variance between runs trained on different samplings of the same
268
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
269
- resulting scores to have as little evaluation noise as possible and sensitive to exact data choice (apart from larger ablation that we are concerned with)
270
  </li>
271
  </ul>
272
  <ul>
273
  <li>performance increasing monotonically (or close) over a training run:
274
- ideally, as the number of seen tokens increases, the performance of a high-signal benchmark should not decrease
275
  (which would be indicative of unreliable results at a small scale)
276
  </li>
277
  </ul>
278
  <ul>
279
- <li>performance above the random noise level with a few standard deviations at least. Given our small ablation models and trainings we usually don't reach extremely high scores on any benchmark, but we want to make sure that the scores we get are above random noise.
280
  </li>
281
  </ul>
282
  <p>After consideration, we selected the following list of benchmarks:</p>
@@ -291,7 +186,7 @@
291
  <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
292
  </ul>
293
  <p>To
294
- compute our checkpoint evaluation in a constrained time, we capped the longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
295
  min on a single node of 8 GPUs - done in parallel to the training).</p>
296
  <aside>You can find the full list of tasks and prompts we used <a
297
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/lighteval_tasks.py">here</a>.</aside>
 
1
  <!doctype html>
2
 
3
  <head>
 
4
  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
5
  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
  <meta charset="utf8">
8
  <base target="_blank">
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
+ <link rel="stylesheet" href="style.css">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  </head>
12
 
13
  <body>
14
  <d-front-matter>
15
  <script id='distill-front-matter' type="text/json">{
16
  "title": "🍷 FineWeb: decanting the web for the finest text data at scale",
17
+ "description": "This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.",
18
  "published": "May 28, 2024",
19
  "affiliation": {"name": "HuggingFace"},
20
  "authors": [
 
82
  <p>Both datasets are released under the permissive <a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a>.</p>
83
 
84
  <p>As 🍷 FineWeb has gathered a lot of interest from the
85
+ community, we decided to explain in full detail the steps involved in creating it as well as our processing decisions and
86
+ many lessons learned along the way. Hence, the present (lengthy) technical report. Read on for all the juicy details on large text dataset creation!</p>
87
+ <aside>For the best possible reading experience, we recommend not using a mobile phone.</aside>
88
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
89
+ recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
90
 
91
  <h2>General considerations on web data</h2>
92
  <h3>Sourcing the data</h3>
93
  <p>A common question often asked regarding web datasets used
94
  to train LLMs is “where do they even get all that data?”. There are generally two options:</p>
95
  <ul>
96
+ <li>you either crawl it yourself, like companies such as OpenAI or Anthropic (among others) do (see <a
97
  href="https://platform.openai.com/docs/gptbot">here</a> and <a
98
  href="https://darkvisitors.com/agents/claudebot">here</a>)
99
  </li>
 
102
  <li>you use a public repository of crawled webpages, like the one maintained by
103
  the non-profit <a href="https://commoncrawl.org/">CommonCrawl</a></li>
104
  </ul>
105
+ <p>To build 🍷 FineWeb, following what has been done in the past by a number of LLM training teams,
106
  we used <a href="https://commoncrawl.org/">CommonCrawl</a> (CC) as a starting point.
107
  The Common Crawl non-profit organization has been crawling the web since 2007 and
108
+ releases a new crawl containing 200 to 400 TiB of textual content obtained via automatic web crawling usually
109
  every 1 or 2 months. </p>
110
+ <p>As an example, the latest CC crawl (April 2024) contains 2.7
111
+ billion web pages, totaling 386 TiB of uncompressed HTML text content<d-footnote>Note that the size changes from crawl to crawl</d-footnote>.
112
+ Ninety-six crawls have been released since 2013 and 3 crawls from 2008 to 2012, which are in a different (older) format.
113
+ <d-footnote>We have not processed these 3 older crawls.</d-footnote> </p>
114
 
115
  <h3>Processing at scale</h3>
116
  <p>Given the sheer size of the data involved, one of the main
 
121
  href="https://github.com/huggingface/datatrove"><code>datatrove</code></a><d-cite bibtex-key="penedo2024datatrove"></d-cite>, an open-source data
122
  processing library that allowed us to seamlessly scale our filtering and deduplication setup to thousands of
123
  CPU cores. All the data processing steps involved in the creation of 🍷 FineWeb used this <a
124
+ href="https://github.com/huggingface/datatrove">library</a>. You will find the exact scripts we used in the
125
+ <a href="https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py"><code>datatrove</code> repository</a>.</p>
126
 
127
  <h3>What is good data?</h3>
128
  <p>This is probably the main question to keep in mind when
129
+ creating a dataset. In most contexts and, in particular, in the context of large language model pretraining <d-footnote>Note that this report is focused on the special field of web-scale datasets ("web-scale" typically meaning >100 billion tokens obtained from the web) used to pretrain a Large Language Model (by pretraining we mean the very first step in the training of a model, starting from random weights). We don't pretend to cover any other field of dataset creation nor that the lessons or hypotheses we develop in this document can extend to any field besides this specific field.</d-footnote>, "high quality" is not a very well defined term<d-cite bibtex-key="albalak2024survey"></d-cite>, and not even a property of documents that can always be clearly perceived through direct human observation alone.<d-cite bibtex-key="longpre2023pretrainers"></d-cite></p>
130
  <p>It is still common to train a model on a given corpus considered "clean"
131
  (typically Wikipedia<d-footnote>Even though as we mentioned above the notion of "clean" is so ill-defined that it should probably not be seen as equivalent to Wikipedia-type text</d-footnote>) and use it to check the perplexity on the dataset
132
  that we were trying to curate<d-cite bibtex-key="wenzek2019ccnet"></d-cite>. Unfortunately this does not always correlate with improved performance on a set of downstream
133
+ tasks of interest<d-cite bibtex-key="soldaini2024dolma"></d-cite>, and as a result another often used approach is to train small models<d-footnote>"Small" in comparison to standard sizes of today's LLMs, i.e. small in comparison to 7-70 billion parameters. In this work "small" means about 1-2 billion parameters</d-footnote> on a representative subset of our dataset and evaluate them on
134
+ a set of evaluation tasks. Small models are used because training costs and time are a function of model size. In this second approach, it is important to
 
135
  choose a diverse and representative set of dataset-evaluation tasks and try not to overfit to any one individual benchmark as it would risk hurting the generality of the obtained LLM after pretraining.</p>
136
  <p>Yet another way to compare different datasets would be to
137
  train a model on each dataset and have humans rate and compare the generations of the models (like on the <a
138
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
139
  reliable results in terms of representing real model usage, but getting ablation results this way is unfortunately
140
+ expensive and slow. It also often requires the models to have undergone an instruction finetuning stage to acquire conversational capabilities, as pretrained models are not directly designed to follow instructions and are thus much more sensitive to prompt details.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
141
  <p>In this work, we went with the approach of training small
142
  models and evaluating them on a set of "early-signal" benchmark tasks. We believe this is a reasonable proxy for the quality
143
+ of the data used to train these models, when keeping in mind the above-mentioned caveat around overfitting on the evaluation benchmarks.</p>
144
  <h3>Ablations and evaluation setup</h3>
145
+ <p>To compare the impact of a given processing
146
+ step, we trained two models on two versions of the dataset, one version processed with the extra step (the one we wish to evaluate) and another version with this step
147
+ ablated (cut/removed). Apart from the data, these two models would be otherwise identical: the same number of parameters, architecture hyper-parameters, and trained
148
  on an equal number of randomly sampled tokens from each version of the data, for a single epoch — the only difference being thus the
149
+ training data. We then evaluated each model on the same set of tasks and compared average
150
  scores.</p>
151
+ <p>Our ablation models were trained using <a
152
  href="https://github.com/huggingface/nanotron"><code>nanotron</code></a> with this config [<strong>TODO:
153
+ INSERT SIMPLIFIED NANOTRON CONFIG HERE</strong>]. Our ablation models have 1.82B parameters (including embeddings), use the Llama
154
+ architecture with a 2048 sequence length, a global batch size of ~2 million tokens, and the GPT2 tokenizer. For most
155
  ablations we trained on ~28B tokens (roughly the Chinchilla<d-cite bibtex-key="hoffmann2022training"></d-cite> optimal training size for this
156
+ model size). To confirm relative performance improvements after each step of filtering we conducted longer training runs on 350 billion tokens as mentioned further below.</p>
157
  <p>We evaluated the models using <a
158
  href="https://github.com/huggingface/lighteval/"><code>lighteval</code></a>. We carefully selected a set of benchmarks for ablations by selecting
159
  benchmarks that would provide good signal at a relatively small scale ("small" models trained on only "a few
 
161
  <ul>
162
  <li>small variance between runs trained on different samplings of the same
163
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
164
+ resulting scores to have as little sensitivity to exact data choice as possible (apart from larger ablations that we are concerned with)
165
  </li>
166
  </ul>
167
  <ul>
168
  <li>performance increasing monotonically (or close) over a training run:
169
+ ideally, as the number of seen tokens increases, the performance on a high-signal benchmark should not decrease
170
  (which would be indicative of unreliable results at a small scale)
171
  </li>
172
  </ul>
173
  <ul>
174
+ <li>performance above random baseline for this task by at least a few standard deviations: given our small ablation models and trainings we usually don't reach extremely high scores on any benchmark, but we want to make sure that the scores we get are above random noise.
175
  </li>
176
  </ul>
177
  <p>After consideration, we selected the following list of benchmarks:</p>
 
186
  <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
187
  </ul>
188
  <p>To
189
+ ensure our checkpoint evaluation stayed within a limited timeframe, we capped the longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
190
  min on a single node of 8 GPUs - done in parallel to the training).</p>
191
  <aside>You can find the full list of tasks and prompts we used <a
192
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/lighteval_tasks.py">here</a>.</aside>
dist/style.css CHANGED
@@ -141,4 +141,103 @@ d-byline .byline {
141
 
142
  d-contents > nav a.active {
143
  text-decoration: underline;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  }
 
141
 
142
  d-contents > nav a.active {
143
  text-decoration: underline;
144
+ }
145
+
146
+ @media (max-width: 1199px) {
147
+ d-contents {
148
+ display: none;
149
+ justify-self: start;
150
+ align-self: start;
151
+ padding-bottom: 0.5em;
152
+ margin-bottom: 1em;
153
+ padding-left: 0.25em;
154
+ border-bottom: 1px solid rgba(0, 0, 0, 0.1);
155
+ border-bottom-width: 1px;
156
+ border-bottom-style: solid;
157
+ border-bottom-color: rgba(0, 0, 0, 0.1);
158
+ }
159
+ }
160
+
161
+ d-contents a:hover {
162
+ border-bottom: none;
163
+ }
164
+
165
+
166
+ @media (min-width: 1200px) {
167
+ d-article {
168
+ /* Ensure d-article does not prevent sticky positioning */
169
+ overflow: visible;
170
+ }
171
+
172
+ d-contents {
173
+ align-self: start;
174
+ grid-column-start: 1 !important;
175
+ grid-column-end: 4 !important;
176
+ grid-row: auto / span 6;
177
+ justify-self: end;
178
+ margin-top: 0em;
179
+ padding-right: 3em;
180
+ padding-left: 2em;
181
+ border-right: 1px solid rgba(0, 0, 0, 0.1);
182
+ border-right-width: 1px;
183
+ border-right-style: solid;
184
+ border-right-color: rgba(0, 0, 0, 0.1);
185
+ position: -webkit-sticky; /* For Safari */
186
+ position: sticky;
187
+ top: 10px; /* Adjust this value if needed */
188
+ }
189
+ }
190
+
191
+ d-contents nav h3 {
192
+ margin-top: 0;
193
+ margin-bottom: 1em;
194
+ }
195
+
196
+ d-contents nav div {
197
+ color: rgba(0, 0, 0, 0.8);
198
+ font-weight: bold;
199
+ }
200
+
201
+ d-contents nav a {
202
+ color: rgba(0, 0, 0, 0.8);
203
+ border-bottom: none;
204
+ text-decoration: none;
205
+ }
206
+
207
+ d-contents li {
208
+ list-style-type: none;
209
+ }
210
+
211
+ d-contents ul, d-article d-contents ul {
212
+ padding-left: 1em;
213
+ }
214
+
215
+ d-contents nav ul li {
216
+ margin-bottom: .25em;
217
+ }
218
+
219
+ d-contents nav a:hover {
220
+ text-decoration: underline solid rgba(0, 0, 0, 0.6);
221
+ }
222
+
223
+ d-contents nav ul {
224
+ margin-top: 0;
225
+ margin-bottom: 6px;
226
+ }
227
+
228
+
229
+ d-contents nav > div {
230
+ display: block;
231
+ outline: none;
232
+ margin-bottom: 0.5em;
233
+ }
234
+
235
+ d-contents nav > div > a {
236
+ font-size: 13px;
237
+ font-weight: 600;
238
+ }
239
+
240
+ d-contents nav > div > a:hover,
241
+ d-contents nav > ul > li > a:hover {
242
+ text-decoration: none;
243
  }
src/distill.js CHANGED
@@ -4382,6 +4382,7 @@ d-footnote-list a.footnote-backlink {
4382
 
4383
  const backlink = document.createElement('a');
4384
  backlink.setAttribute('class', 'footnote-backlink');
 
4385
  backlink.textContent = '[↩]';
4386
  backlink.href = '#' + footnote.id;
4387
 
 
4382
 
4383
  const backlink = document.createElement('a');
4384
  backlink.setAttribute('class', 'footnote-backlink');
4385
+ backlink.setAttribute('target', '_self');
4386
  backlink.textContent = '[↩]';
4387
  backlink.href = '#' + footnote.id;
4388
 
src/index.html CHANGED
@@ -1,125 +1,20 @@
1
  <!doctype html>
2
 
3
  <head>
4
- <link rel="stylesheet" href="style.css">
5
  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
6
  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
7
  <meta name="viewport" content="width=device-width, initial-scale=1">
8
  <meta charset="utf8">
9
  <base target="_blank">
10
  <title>FineWeb: decanting the web for the finest text data at scale</title>
11
- <style>
12
-
13
- /* ****************************************
14
- * TOC
15
- ******************************************/
16
- @media (max-width: 1199px) {
17
- d-contents {
18
- display: none;
19
- justify-self: start;
20
- align-self: start;
21
- padding-bottom: 0.5em;
22
- margin-bottom: 1em;
23
- padding-left: 0.25em;
24
- border-bottom: 1px solid rgba(0, 0, 0, 0.1);
25
- border-bottom-width: 1px;
26
- border-bottom-style: solid;
27
- border-bottom-color: rgba(0, 0, 0, 0.1);
28
- }
29
- }
30
-
31
- d-contents a:hover {
32
- border-bottom: none;
33
- }
34
-
35
-
36
- @media (min-width: 1200px) {
37
- d-article {
38
- /* Ensure d-article does not prevent sticky positioning */
39
- overflow: visible;
40
- }
41
-
42
- d-contents {
43
- align-self: start;
44
- grid-column-start: 1 !important;
45
- grid-column-end: 4 !important;
46
- grid-row: auto / span 6;
47
- justify-self: end;
48
- margin-top: 0em;
49
- padding-right: 3em;
50
- padding-left: 2em;
51
- border-right: 1px solid rgba(0, 0, 0, 0.1);
52
- border-right-width: 1px;
53
- border-right-style: solid;
54
- border-right-color: rgba(0, 0, 0, 0.1);
55
- position: -webkit-sticky; /* For Safari */
56
- position: sticky;
57
- top: 10px; /* Adjust this value if needed */
58
- }
59
- }
60
-
61
- d-contents nav h3 {
62
- margin-top: 0;
63
- margin-bottom: 1em;
64
- }
65
-
66
- d-contents nav div {
67
- color: rgba(0, 0, 0, 0.8);
68
- font-weight: bold;
69
- }
70
-
71
- d-contents nav a {
72
- color: rgba(0, 0, 0, 0.8);
73
- border-bottom: none;
74
- text-decoration: none;
75
- }
76
-
77
- d-contents li {
78
- list-style-type: none;
79
- }
80
-
81
- d-contents ul, d-article d-contents ul {
82
- padding-left: 1em;
83
- }
84
-
85
- d-contents nav ul li {
86
- margin-bottom: .25em;
87
- }
88
-
89
- d-contents nav a:hover {
90
- text-decoration: underline solid rgba(0, 0, 0, 0.6);
91
- }
92
-
93
- d-contents nav ul {
94
- margin-top: 0;
95
- margin-bottom: 6px;
96
- }
97
-
98
-
99
- d-contents nav > div {
100
- display: block;
101
- outline: none;
102
- margin-bottom: 0.5em;
103
- }
104
-
105
- d-contents nav > div > a {
106
- font-size: 13px;
107
- font-weight: 600;
108
- }
109
-
110
- d-contents nav > div > a:hover,
111
- d-contents nav > ul > li > a:hover {
112
- text-decoration: none;
113
- }
114
-
115
- </style>
116
  </head>
117
 
118
  <body>
119
  <d-front-matter>
120
  <script id='distill-front-matter' type="text/json">{
121
  "title": "🍷 FineWeb: decanting the web for the finest text data at scale",
122
- "description": "This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our design choices), and the process followed to create 📚 FineWeb-Edu.",
123
  "published": "May 28, 2024",
124
  "affiliation": {"name": "HuggingFace"},
125
  "authors": [
@@ -187,17 +82,18 @@
187
  <p>Both datasets are released under the permissive <a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a></p>
188
 
189
  <p>As 🍷 FineWeb has gathered a lot of interest from the
190
- community, we decided to explain in all details the steps involved in creating it as well as our processing decisions and
191
- many lessons learned along the way. Hence the present (lengthy) technical report. Read on for all the juicy details on large text dataset creation!</p>
 
192
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
193
- recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset as well.</p>
194
 
195
  <h2>General considerations on web data</h2>
196
  <h3>Sourcing the data</h3>
197
  <p>A common question often asked regarding web datasets used
198
  to train LLMs is “where do they even get all that data?”. There are generally two options:</p>
199
  <ul>
200
- <li>you either crawl it yourself, like companies like OpenAI or Anthropic (among other) do (see hints <a
201
  href="https://platform.openai.com/docs/gptbot">here</a> and <a
202
  href="https://darkvisitors.com/agents/claudebot">here</a>)
203
  </li>
@@ -206,15 +102,15 @@
206
  <li>you use a public repository of crawled webpages, like the one maintained by
207
  the non-profit <a href="https://commoncrawl.org/">CommonCrawl</a></li>
208
  </ul>
209
- <p>To build and filter 🍷 FineWeb, following what had done in the past by number of LLM training teams,
210
  we used <a href="https://commoncrawl.org/">CommonCrawl</a> (CC) as a starting point.
211
  The Common Crawl non–profit organization has been crawling the web since 2007 and
212
- release a new crawl containing 200 to 300 TB of textual content obtained via automatic web crawling usually
213
  every 1 or 2 months. </p>
214
- <p>As an example, the latest CC crawl (id 2024-18) contains 2.7
215
- billion web pages, totaling 386 TiB of uncompressed HTML text content (Note that the size changes from dump to dump).
216
- Ninety-six crawls have been released since 2013 and 3 dumps from 2008 to 2012, which are in a different (older) format.
217
- <d-footnote>We have not processed these 3 older dumps.</d-footnote> </p>
218
 
219
  <h3>Processing at scale</h3>
220
  <p>Given the sheer size of the data involved, one of the main
@@ -225,40 +121,39 @@
225
  href="https://github.com/huggingface/datatrove"><code>datatrove</code></a><d-cite bibtex-key="penedo2024datatrove"></d-cite>, an open-source data
226
  processing library that allowed us to seamlessly scale our filtering and deduplication setup to thousands of
227
  CPU cores. All the data processing steps involved in the creation of 🍷 FineWeb used this <a
228
- href="https://github.com/huggingface/datatrove">library</a>. In most cases, you'll find the exact same scripts we used in the <a
229
- href="https://github.com/huggingface/datatrove"><code>datatrove</code></a> repository.</p>
230
 
231
  <h3>What is good data?</h3>
232
  <p>This is probably the main question to keep in mind when
233
- creating a dataset. In most context and in particular in the context of large language model pretraining <d-footnote>Note that all our discussion in this report is focused on the special field of web-scale datasets ("web-scale" typically meaning >100 billion tokens) used to pretrained a Large Language Models (by pretraining we mean the very first step of training of a model, starting from random weights). We don't pretend to cover any other field of dataset creation nor that the lessons or hypothesis we develop in this document can extend to any field beside this specific field.</d-footnote>, "high quality" is not a very well defined term<d-cite bibtex-key="albalak2024survey"></d-cite>, and not even a property of documents that can always be clearly perceived through direct, human, observation alone.<d-cite bibtex-key="longpre2023pretrainers"></d-cite></p>
234
  <p>It is still common to train a model on a given corpus considered "clean"
235
  (typically wikipedia<d-footnote>Even though as we mentioned above the notion of "clean" is so ill-defined that it should probably not been seen as equivalent to wikipedia-type of text</d-footnote>) and use it to check the perplexity on the dataset
236
  that we were trying to curate<d-cite bibtex-key="wenzek2019ccnet"></d-cite>. Unfortunately this does not always correlate with improved performance on a set of downstream
237
- tasks of interest<d-cite bibtex-key="soldaini2024dolma"></d-cite>, and as a result another often used approach is to train small models<d-footnote>"Small" in comparison to standard sizes of today's LLM, i.e. small in comparison to 7-70 billion parameters. In this work "small" means about 1-2 billion parameters</d-footnote> on a representative subset of our dataset and evaluate them on
238
- a set of evaluation tasks. The reason small model are used is because training models is
239
- expensive and time consuming as a function of model size. In this second approach, it is important to
240
  choose a diverse and representative set of dataset-evaluation tasks and try not to overfit to any one individual benchmark as it would risk hurting the generality of the obtained LLM after pretraining.</p>
241
  <p>Yet another way to compare different datasets would be to
242
  train a model on each dataset and have humans rate and compare the generations of the models (like on the <a
243
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
244
  reliable results in terms of representing real model usage, but getting ablation results this way is unfortunately
245
- expensive and slow. It also often requires for the models to have undergone through an instruction finetuning stage to acquire conversational capabilities, as pretrained models are not directly-designed to follow instructions and thus much more sensitive to prompt details.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
246
  <p>In this work, we went with the approach of training small
247
  models and evaluating them on a set of "early-signal" benchmark tasks. We believe this is a reasonable proxy for the quality
248
- of the data used to train these models with the above mentioned caveat around overfitting on the evaluation benchmarks.</p>
249
  <h3>Ablations and evaluation setup</h3>
250
- <p>To be able to compare the impact of a given processing
251
- step, we typically train two models on two versions of the dataset, one version processed with the extra –ablated– step and another version with this step
252
- ablated (cut/removed). Apart from the data, these two models are otherwise identical: same number of parameters, architecture hyper-parameters, and are trained
253
  on an equal number of randomly sampled tokens from each version of the data, for a single epoch — the only difference being thus the
254
- training data. We then evaluate each model on the same set of tasks and compare average
255
  scores.</p>
256
- <p>Our ablation models are trained using <a
257
  href="https://github.com/huggingface/nanotron"><code>nanotron</code></a> with this config [<strong>TODO:
258
- INSERT SIMPLIFIED NANOTRON CONFIG HERE</strong>]. Ablation models have 1.82B parameters (including embeddings), used the Llama
259
- architecture with a 2048 sequence length, a global batch size of ~2 million tokens and GPT2 tokenizer as mentioned above. For most
260
  ablations we trained on ~28B tokens (roughly the Chinchilla<d-cite bibtex-key="hoffmann2022training"></d-cite> optimal training size for this
261
- model size). To confirm relative performance after several steps of filtering we conducted longer training runs in 350 billion tokens as mentioned further below.</p>
262
  <p>We evaluated the models using <a
263
  href="https://github.com/huggingface/lighteval/"><code>lighteval</code></a>. We carefully selected a set of benchmark for ablations by selecting
264
  benchmarks that would provide good signal at a relatively small scale ("small" models trained on only "a few
@@ -266,17 +161,17 @@
266
  <ul>
267
  <li>small variance between runs trained on different samplings of the same
268
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
269
- resulting scores to have as little evaluation noise as possible and sensitive to exact data choice (apart from larger ablation that we are concerned with)
270
  </li>
271
  </ul>
272
  <ul>
273
  <li>performance increasing monotonically (or close) over a training run:
274
- ideally, as the number of seen tokens increases, the performance of a high-signal benchmark should not decrease
275
  (which would be indicative of unreliable results at a small scale)
276
  </li>
277
  </ul>
278
  <ul>
279
- <li>performance above the random noise level with a few standard deviations at least. Given our small ablation models and trainings we usually don't reach extremely high scores on any benchmark, but we want to make sure that the scores we get are above random noise.
280
  </li>
281
  </ul>
282
  <p>After consideration, we selected the following list of benchmarks:</p>
@@ -291,7 +186,7 @@
291
  <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
292
  </ul>
293
  <p>To
294
- compute our checkpoint evaluation in a constrained time, we capped the longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
295
  min on a single node of 8 GPUs - done in parallel to the training).</p>
296
  <aside>You can find the full list of tasks and prompts we used <a
297
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/lighteval_tasks.py">here</a>.</aside>
 
1
  <!doctype html>
2
 
3
  <head>
 
4
  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
5
  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
  <meta charset="utf8">
8
  <base target="_blank">
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
+ <link rel="stylesheet" href="style.css">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  </head>
12
 
13
  <body>
14
  <d-front-matter>
15
  <script id='distill-front-matter' type="text/json">{
16
  "title": "🍷 FineWeb: decanting the web for the finest text data at scale",
17
+ "description": "This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.",
18
  "published": "May 28, 2024",
19
  "affiliation": {"name": "HuggingFace"},
20
  "authors": [
 
82
  <p>Both datasets are released under the permissive <a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a></p>
83
 
84
  <p>As 🍷 FineWeb has gathered a lot of interest from the
85
+ community, we decided to explain in full detail the steps involved in creating it as well as our processing decisions and
86
+ many lessons learned along the way. Hence, the present (lengthy) technical report. Read on for all the juicy details on large text dataset creation!</p>
87
+ <aside>For the best possible reading experience, we recommend not using a mobile phone.</aside>
88
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
89
+ recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
90
 
91
  <h2>General considerations on web data</h2>
92
  <h3>Sourcing the data</h3>
93
  <p>A common question often asked regarding web datasets used
94
  to train LLMs is “where do they even get all that data?”. There are generally two options:</p>
95
  <ul>
96
+ <li>you either crawl it yourself, like companies such as OpenAI or Anthropic (among others) do (see <a
97
  href="https://platform.openai.com/docs/gptbot">here</a> and <a
98
  href="https://darkvisitors.com/agents/claudebot">here</a>)
99
  </li>
 
102
  <li>you use a public repository of crawled webpages, like the one maintained by
103
  the non-profit <a href="https://commoncrawl.org/">CommonCrawl</a></li>
104
  </ul>
105
+ <p>To build 🍷 FineWeb, following what has been done in the past by a number of LLM training teams,
106
  we used <a href="https://commoncrawl.org/">CommonCrawl</a> (CC) as a starting point.
107
  The Common Crawl non–profit organization has been crawling the web since 2007 and
108
+ releases a new crawl containing 200 to 400 TiB of textual content obtained via automatic web crawling usually
109
  every 1 or 2 months. </p>
110
+ <p>As an example, the latest CC crawl (April 2024) contains 2.7
111
+ billion web pages, totaling 386 TiB of uncompressed HTML text content<d-footnote>Note that the size changes from crawl to crawl</d-footnote>.
112
+ Ninety-six crawls have been released since 2013 and 3 crawls from 2008 to 2012, which are in a different (older) format.
113
+ <d-footnote>We have not processed these 3 older crawls.</d-footnote> </p>
114
 
115
  <h3>Processing at scale</h3>
116
  <p>Given the sheer size of the data involved, one of the main
 
121
  href="https://github.com/huggingface/datatrove"><code>datatrove</code></a><d-cite bibtex-key="penedo2024datatrove"></d-cite>, an open-source data
122
  processing library that allowed us to seamlessly scale our filtering and deduplication setup to thousands of
123
  CPU cores. All the data processing steps involved in the creation of 🍷 FineWeb used this <a
124
+ href="https://github.com/huggingface/datatrove">library</a>. You will find the exact scripts we used in the
125
+ <a href="https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py"><code>datatrove</code> repository</a>.</p>
126
 
127
  <h3>What is good data?</h3>
128
  <p>This is probably the main question to keep in mind when
129
+ creating a dataset. In most contexts and, in particular, in the context of large language model pretraining <d-footnote>Note that this report is focused on the special field of web-scale datasets ("web-scale" typically meaning >100 billion tokens obtained from the web) used to pretrain a Large Language Model (by pretraining we mean the very first step in the training of a model, starting from random weights). We don't pretend to cover any other field of dataset creation nor that the lessons or hypotheses we develop in this document can extend to any field besides this specific field.</d-footnote>, "high quality" is not a very well defined term<d-cite bibtex-key="albalak2024survey"></d-cite>, and not even a property of documents that can always be clearly perceived through direct human observation alone.<d-cite bibtex-key="longpre2023pretrainers"></d-cite></p>
130
  <p>It is still common to train a model on a given corpus considered "clean"
131
  (typically wikipedia<d-footnote>Even though as we mentioned above the notion of "clean" is so ill-defined that it should probably not be seen as equivalent to wikipedia-type of text</d-footnote>) and use it to check the perplexity on the dataset
132
  that we were trying to curate<d-cite bibtex-key="wenzek2019ccnet"></d-cite>. Unfortunately this does not always correlate with improved performance on a set of downstream
133
+ tasks of interest<d-cite bibtex-key="soldaini2024dolma"></d-cite>, and as a result another often used approach is to train small models<d-footnote>"Small" in comparison to standard sizes of today's LLMs, i.e. small in comparison to 7-70 billion parameters. In this work "small" means about 1-2 billion parameters</d-footnote> on a representative subset of our dataset and evaluate them on
134
+ a set of evaluation tasks. Small models are used because training costs and time are a function of model size. In this second approach, it is important to
 
135
  choose a diverse and representative set of dataset-evaluation tasks and try not to overfit to any one individual benchmark as it would risk hurting the generality of the obtained LLM after pretraining.</p>
136
  <p>Yet another way to compare different datasets would be to
137
  train a model on each dataset and have humans rate and compare the generations of the models (like on the <a
138
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
139
  reliable results in terms of representing real model usage, but getting ablation results this way is unfortunately
140
+ expensive and slow. It also often requires the models to have undergone an instruction finetuning stage to acquire conversational capabilities, as pretrained models are not directly designed to follow instructions and are thus much more sensitive to prompt details.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
141
  <p>In this work, we went with the approach of training small
142
  models and evaluating them on a set of "early-signal" benchmark tasks. We believe this is a reasonable proxy for the quality
143
+ of the data used to train these models, when keeping in mind the above-mentioned caveat around overfitting on the evaluation benchmarks.</p>
144
  <h3>Ablations and evaluation setup</h3>
145
+ <p>To compare the impact of a given processing
146
+ step, we trained two models on two versions of the dataset, one version processed with the extra step (the one we wish to evaluate) and another version with this step
147
+ ablated (cut/removed). Apart from the data, these two models would be otherwise identical: the same number of parameters, architecture hyper-parameters, and trained
148
  on an equal number of randomly sampled tokens from each version of the data, for a single epoch — the only difference being thus the
149
+ training data. We then evaluated each model on the same set of tasks and compared average
150
  scores.</p>
151
+ <p>Our ablation models were trained using <a
152
  href="https://github.com/huggingface/nanotron"><code>nanotron</code></a> with this config [<strong>TODO:
153
+ INSERT SIMPLIFIED NANOTRON CONFIG HERE</strong>]. Our ablation models have 1.82B parameters (including embeddings), used the Llama
154
+ architecture with a 2048 sequence length, a global batch size of ~2 million tokens, and the GPT2 tokenizer. For most
155
  ablations we trained on ~28B tokens (roughly the Chinchilla<d-cite bibtex-key="hoffmann2022training"></d-cite> optimal training size for this
156
+ model size). To confirm relative performance improvements after each step of filtering we conducted longer training runs on 350 billion tokens as mentioned further below.</p>
157
  <p>We evaluated the models using <a
158
  href="https://github.com/huggingface/lighteval/"><code>lighteval</code></a>. We carefully selected a set of benchmarks for ablations by selecting
159
  benchmarks that would provide good signal at a relatively small scale ("small" models trained on only "a few
 
161
  <ul>
162
  <li>small variance between runs trained on different samplings of the same
163
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
164
+ resulting scores to have as little sensitivity to exact data choice as possible (apart from larger ablations that we are concerned with)
165
  </li>
166
  </ul>
167
  <ul>
168
  <li>performance increasing monotonically (or close) over a training run:
169
+ ideally, as the number of seen tokens increases, the performance on a high-signal benchmark should not decrease
170
  (which would be indicative of unreliable results at a small scale)
171
  </li>
172
  </ul>
173
  <ul>
174
+ <li>performance above random baseline for this task by at least a few standard deviations: given our small ablation models and trainings we usually don't reach extremely high scores on any benchmark, but we want to make sure that the scores we get are above random noise.
175
  </li>
176
  </ul>
177
  <p>After consideration, we selected the following list of benchmarks:</p>
 
186
  <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
187
  </ul>
188
  <p>To
189
+ ensure our checkpoint evaluation stayed within a limited timeframe, we capped the longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
190
  min on a single node of 8 GPUs - done in parallel to the training).</p>
191
  <aside>You can find the full list of tasks and prompts we used <a
192
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/lighteval_tasks.py">here</a>.</aside>
src/style.css CHANGED
@@ -141,4 +141,103 @@ d-byline .byline {
141
 
142
  d-contents > nav a.active {
143
  text-decoration: underline;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  }
 
141
 
142
  d-contents > nav a.active {
143
  text-decoration: underline;
144
+ }
145
+
146
+ @media (max-width: 1199px) {
147
+ d-contents {
148
+ display: none;
149
+ justify-self: start;
150
+ align-self: start;
151
+ padding-bottom: 0.5em;
152
+ margin-bottom: 1em;
153
+ padding-left: 0.25em;
154
+ border-bottom: 1px solid rgba(0, 0, 0, 0.1);
155
+ border-bottom-width: 1px;
156
+ border-bottom-style: solid;
157
+ border-bottom-color: rgba(0, 0, 0, 0.1);
158
+ }
159
+ }
160
+
161
+ d-contents a:hover {
162
+ border-bottom: none;
163
+ }
164
+
165
+
166
+ @media (min-width: 1200px) {
167
+ d-article {
168
+ /* Ensure d-article does not prevent sticky positioning */
169
+ overflow: visible;
170
+ }
171
+
172
+ d-contents {
173
+ align-self: start;
174
+ grid-column-start: 1 !important;
175
+ grid-column-end: 4 !important;
176
+ grid-row: auto / span 6;
177
+ justify-self: end;
178
+ margin-top: 0em;
179
+ padding-right: 3em;
180
+ padding-left: 2em;
181
+ border-right: 1px solid rgba(0, 0, 0, 0.1);
182
+ border-right-width: 1px;
183
+ border-right-style: solid;
184
+ border-right-color: rgba(0, 0, 0, 0.1);
185
+ position: -webkit-sticky; /* For Safari */
186
+ position: sticky;
187
+ top: 10px; /* Adjust this value if needed */
188
+ }
189
+ }
190
+
191
+ d-contents nav h3 {
192
+ margin-top: 0;
193
+ margin-bottom: 1em;
194
+ }
195
+
196
+ d-contents nav div {
197
+ color: rgba(0, 0, 0, 0.8);
198
+ font-weight: bold;
199
+ }
200
+
201
+ d-contents nav a {
202
+ color: rgba(0, 0, 0, 0.8);
203
+ border-bottom: none;
204
+ text-decoration: none;
205
+ }
206
+
207
+ d-contents li {
208
+ list-style-type: none;
209
+ }
210
+
211
+ d-contents ul, d-article d-contents ul {
212
+ padding-left: 1em;
213
+ }
214
+
215
+ d-contents nav ul li {
216
+ margin-bottom: .25em;
217
+ }
218
+
219
+ d-contents nav a:hover {
220
+ text-decoration: underline solid rgba(0, 0, 0, 0.6);
221
+ }
222
+
223
+ d-contents nav ul {
224
+ margin-top: 0;
225
+ margin-bottom: 6px;
226
+ }
227
+
228
+
229
+ d-contents nav > div {
230
+ display: block;
231
+ outline: none;
232
+ margin-bottom: 0.5em;
233
+ }
234
+
235
+ d-contents nav > div > a {
236
+ font-size: 13px;
237
+ font-weight: 600;
238
+ }
239
+
240
+ d-contents nav > div > a:hover,
241
+ d-contents nav > ul > li > a:hover {
242
+ text-decoration: none;
243
  }