hynky (HF staff) committed on
Commit
fb0323d
·
2 Parent(s): 0d2b8da 09e5351

Merge branch 'main' of hf.co:spaces/HuggingFaceFW/blogpost-fineweb-v1

Files changed (4)
  1. bibliography.bib +84 -0
  2. index.html +77 -66
  3. src/clusters.js +1 -1
  4. src/plotting.js +2 -2
bibliography.bib CHANGED
@@ -234,4 +234,88 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
234
  year = {2024},
235
  url = {https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/},
236
  note = {Accessed: 2024-05-31}
237
  }
 
234
  year = {2024},
235
  url = {https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/},
236
  note = {Accessed: 2024-05-31}
237
+ }
238
+ @inproceedings{talmor-etal-2019-commonsenseqa,
239
+ title = "CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge",
240
+ author = "Talmor, Alon and
241
+ Herzig, Jonathan and
242
+ Lourie, Nicholas and
243
+ Berant, Jonathan",
244
+ booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
245
+ month = jun,
246
+ year = "2019",
247
+ address = "Minneapolis, Minnesota",
248
+ publisher = "Association for Computational Linguistics",
249
+ url = "https://aclanthology.org/N19-1421",
250
+ doi = "10.18653/v1/N19-1421",
251
+ pages = "4149--4158",
252
+ archivePrefix = "arXiv",
253
+ eprint = "1811.00937",
254
+ primaryClass = "cs",
255
+ }
256
+ @inproceedings{zellers-etal-2019-hellaswag,
257
+ title = "HellaSwag: Can a Machine Really Finish Your Sentence?",
258
+ author = "Zellers, Rowan and
259
+ Holtzman, Ari and
260
+ Bisk, Yonatan and
261
+ Farhadi, Ali and
262
+ Choi, Yejin",
263
+ editor = "Korhonen, Anna and
264
+ Traum, David and
265
+ M{\`a}rquez, Llu{\'\i}s",
266
+ booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
267
+ month = jul,
268
+ year = "2019",
269
+ address = "Florence, Italy",
270
+ publisher = "Association for Computational Linguistics",
271
+ url = "https://aclanthology.org/P19-1472",
272
+ doi = "10.18653/v1/P19-1472",
273
+ pages = "4791--4800",
274
+ abstract = "Recent work by Zellers et al. (2018) introduced a new task of commonsense natural language inference: given an event description such as {``}A woman sits at a piano,{''} a machine must select the most likely followup: {``}She sets her fingers on the keys.{''} With the introduction of BERT, near human-level performance was reached. Does this mean that machines can perform human level commonsense inference? In this paper, we show that commonsense inference still proves difficult for even state-of-the-art models, by presenting HellaSwag, a new challenge dataset. Though its questions are trivial for humans ({\textgreater}95{\%} accuracy), state-of-the-art models struggle ({\textless}48{\%}). We achieve this via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. AF proves to be surprisingly robust. The key insight is to scale up the length and complexity of the dataset examples towards a critical {`}Goldilocks{'} zone wherein generated text is ridiculous to humans, yet often misclassified by state-of-the-art models. Our construction of HellaSwag, and its resulting difficulty, sheds light on the inner workings of deep pretrained models. More broadly, it suggests a new path forward for NLP research, in which benchmarks co-evolve with the evolving state-of-the-art in an adversarial way, so as to present ever-harder challenges.",
275
+ }
276
+ @inproceedings{OpenBookQA2018,
277
+ title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
278
+ author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
279
+ booktitle={EMNLP},
280
+ year={2018}
281
+ }
282
+ @misc{bisk2019piqa,
283
+ title={PIQA: Reasoning about Physical Commonsense in Natural Language},
284
+ author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
285
+ year={2019},
286
+ eprint={1911.11641},
287
+ archivePrefix={arXiv},
288
+ primaryClass={cs.CL}
289
+ }
290
+ @misc{sap2019socialiqa,
291
+ title={SocialIQA: Commonsense Reasoning about Social Interactions},
292
+ author={Maarten Sap and Hannah Rashkin and Derek Chen and Ronan LeBras and Yejin Choi},
293
+ year={2019},
294
+ eprint={1904.09728},
295
+ archivePrefix={arXiv},
296
+ primaryClass={cs.CL}
297
+ }
298
+ @misc{sakaguchi2019winogrande,
299
+ title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
300
+ author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
301
+ year={2019},
302
+ eprint={1907.10641},
303
+ archivePrefix={arXiv},
304
+ primaryClass={cs.CL}
305
+ }
306
+ @misc{clark2018think,
307
+ title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
308
+ author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
309
+ year={2018},
310
+ eprint={1803.05457},
311
+ archivePrefix={arXiv},
312
+ primaryClass={cs.AI}
313
+ }
314
+ @misc{hendrycks2021measuring,
315
+ title={Measuring Massive Multitask Language Understanding},
316
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
317
+ year={2021},
318
+ eprint={2009.03300},
319
+ archivePrefix={arXiv},
320
+ primaryClass={cs.CY}
321
  }
index.html CHANGED
@@ -12,7 +12,7 @@
12
  <meta name="viewport" content="width=device-width, initial-scale=1">
13
  <meta charset="utf8">
14
  <base target="_blank">
15
- <title>FineWeb: 15T tokens of high quality web data</title>
16
  <style>
17
 
18
  /* ****************************************
@@ -188,6 +188,7 @@
188
  <p>🍷 FineWeb, a 15-trillion token dataset derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots, produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies.</p>
189
  <p>We are also excited to announce the release of <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a version of 🍷 FineWeb that was filtered for educational content, available in two sizes: <strong>1.3 trillion (very high quality) and 5.4 trillion (high quality) tokens</strong>. 📚 FineWeb-Edu outperforms all existing public web datasets, with models pretrained on it showing notable improvements on knowledge- and reasoning-intensive benchmarks like MMLU, ARC, and OpenBookQA. You can
190
  download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">here</a>.</p>
 
191
 
192
  <p>As 🍷 FineWeb has gathered a lot of interest from the
193
  community, we decided to further explain the steps involved in creating it, our processing decisions and
@@ -237,7 +238,7 @@
237
  a set of evaluation tasks. As we are curating a dataset for pretraining a generalist LLM, it is important to
238
  choose a diverse set of tasks and try not to overfit to any one individual benchmark.</p>
239
  <p>Another way to evaluate different datasets would be to
240
- train a model on each one and have humans rate and compare the outputs of each one (like on the <a
241
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
242
  reliable results in terms of representing real model usage, but getting ablation results this way is too
243
  expensive and slow. It also often requires that the models have undergone at least an instruction finetuning stage, as pretrained models have difficulty following instructions.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
@@ -264,15 +265,26 @@
264
  <ul>
265
  <li>small variance between runs trained on different samplings of the same
266
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
267
- resulting scores to have as little noise as possible
268
  </li>
269
  </ul>
270
  <ul>
271
  <li>performance increasing monotonically (or close) over a training run:
272
  ideally, as the number of seen tokens increases, the performance on this benchmark should not decrease
273
- (should not be too noisy)
274
  </li>
275
  </ul>
276
  <p>To
277
  have results quickly we capped longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
278
  min on a single node of 8 GPUs - done in parallel to the training).</p>
@@ -334,15 +346,13 @@
334
  extracted dumps (there are currently 96 dumps) we obtained roughly 36 trillion tokens of data (when
335
  tokenized with the <code>gpt2</code> tokenizer).</p>
336
  <h3>Deduplication</h3>
337
- <p>Deduplication is another important step, specially for web
338
- datasets. Methods to deduplicate datasets attempt to remove redundant/repeated data. Deduplication is one of
339
- the most important steps when creating large web datasets for LLMs.</p>
340
  <h4>Why deduplicate?</h4>
341
  <p>The web has many aggregators, mirrors, templated pages or
342
  just otherwise repeated content spread over different domains and webpages. Often, these duplicated pages
343
  can be introduced by the crawler itself, when different links point to the same page. </p>
344
  <p>Removing these duplicates (deduplicating) has been linked to an improvement in model performance<d-cite bibtex-key="lee2022deduplicating"></d-cite> and a reduction in memorization of pretraining data<d-cite bibtex-key="carlini2023quantifying"></d-cite>, which might
345
- allow for better generalization. Additionally, the performance uplift can also be tied to increased training
346
  efficiency: by removing duplicated content, for the same number of training tokens, a model will have seen
347
  more diverse data.<d-cite bibtex-key="muennighoff2023scaling"></d-cite><d-cite bibtex-key="hernandez2022scaling"></d-cite></p>
348
  <p>There are different ways to identify and even define
@@ -351,11 +361,11 @@
351
  similarity metric to mark documents as duplicates, or “exact” by checking for exact matches between two
352
  documents (or lines, paragraphs, or whatever other granularity level being used).</p>
353
  <h4>Our deduplication parameters</h4>
354
- <p>Similarly to RefinedWeb, we decided to apply MinHash, a
355
- fuzzy hash based deduplication technique that scales well and allows us to tune similarity thresholds (by changing the number and size of buckets) and the granularity of the matches (changing the n-gram size). We chose to compute minhashes on each document’s 5-grams, using
356
  112 hash functions in total, split into 14 buckets of 8 hashes each — targeting documents that are at least
357
  75% similar. Documents with the same 8 minhashes in any bucket are considered a duplicate of each other.</p>
358
- <p>This would mean that for two documents with a similarity (<code>s</code>)
359
  of 0.7, 0.75, 0.8 and 0.85, the probability that they would be identified as duplicates would be 56%, 77%,
360
  92% and 98.8% respectively ($$1-(1-s^8)^{14}$$). See the plot below for a match probability
361
  comparison between our setup with 112 hashes and the one from RefinedWeb, with 9000 hashes, divided into 450
@@ -370,19 +380,19 @@
370
  <p>It should also be noted that intra-document deduplication is already handled by our repetition filter, which removes documents with many repeated lines and paragraphs.</p>
371
  <h4>More deduplication is always better, right?</h4>
372
  <p>Our initial approach was to take the entire dataset (all
373
- 96 dumps) and deduplicate them as one big dataset using MinHash.</p>
374
  <p>We did this in an iterative manner: starting with the most
375
- recent dump (which at the time was 2023-50) and taking the oldest one last, we would deduplicate each dump
376
- not only against itself but also by removing any matches with duplicates from the previously processed
377
  dumps. </p>
378
  <p>For instance, for the second most recent dump (2023-40 at
379
- the time), we deduplicated it against the most recent one in addition to itself. In particular, the oldest
380
- dump was deduplicated against all other dumps. As a result, more data was removed in the oldest dumps (last
381
- to be deduplicated) than in the most recent ones.</p>
382
  <p>Deduplicating the dataset in this manner resulted in 4
383
  trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
384
  tokens subset, the model showed no improvement over one trained on the non deduplicated data (see orange and
385
- green curve below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
386
  <div class="main-plot-container">
387
  <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
388
  <div id="plot-dedup_all_dumps_bad"></div>
@@ -394,7 +404,7 @@
394
  <li>pre deduplication, this dump had ~490 billion tokens</li>
395
  </ul>
396
  <ul>
397
- <li>after our iterative MinHash, ~31 billion tokens remained (94% of data
398
  removed)
399
  </li>
400
  </ul>
@@ -411,14 +421,13 @@
411
  iterative dedup process (<em>originally removed data</em>)<d-footnote>While there may be documents in <em>originally kept
412
  data</em> similar to documents in <em>originally removed data</em>, we estimate the overlap to be small (around 4 billion tokens)</d-footnote>
413
  </li>
414
-
415
  </ul>
416
  <div class="main-plot-container">
417
  <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
418
  <div id="plot-removed_data_cross_dedup"></div>
419
  </div>
420
- <p>These results show that, for this older dump where we were
421
- removing over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
422
  removed (considered independently of all the other dumps). This is also confirmed by visual inspection: <em>originally kept
423
  data</em> contains far more ads, lists of keywords and generally badly formatted text than <em>originally removed data</em>.</p>
424
  <h4>Taking a step back: individual dump dedup</h4>
@@ -434,12 +443,12 @@
434
  <p>We hypothesize that the main improvement gained from
435
  deduplication is the removal of very large clusters that are present in every single dump (you will find
436
  some examples of these clusters on the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
437
- documents) and that further deduplication for low number of deduplications (less than ~100 i.e. the number
438
- of dumps) actually harm performance: data that does not find a duplicate match in any other dump might
439
  actually be worse quality/more out of distribution (as evidenced by the results on the 2013-48 data). </p>
440
  <p>While you might see some performance improvement when
441
- deduplicating a few dumps together, at the scale of all the dumps this upsampling of lower quality data side
442
- effect seems to have a great impact.</p>
443
  <p>One possibility to consider is that as filtering quality
444
  improves, this effect may not be as prevalent, since the filtering might be able to remove some of this
445
  lower quality data. We also experimented with applying different, and often “lighter”, deduplication
@@ -453,7 +462,7 @@
453
  tokens on measuring deduplication impact, we considered the following (very extreme and unrealistic
454
  regarding the degree of duplication observed) theoretical scenario:</p>
455
  <ul>
456
- <li>there are 100 CommonCrawl dumps (actually roughly true)</li>
457
  </ul>
458
  <ul>
459
  <li>each dump has been perfectly individually deduplicated (every single
@@ -485,30 +494,30 @@
485
  (#duplicates=1), despite the fact that in the entire dataset each document is repeated 100 times (once per
486
  dump). We start seeing some changes at the 100B scale (0.5% of the total dataset), with a large number of
487
  documents being repeated twice, and a few even 4-8 times. At the larger scale of 1T (5% of the total
488
- dataset), the majority of the documents are repeated up to 8 times, with a some being repeated up to 16
489
  times. </p>
490
  <p>We ran our performance evaluations for the deduplicated
491
  data at the 350B scale, which would, under this theoretical scenario, be made up of a significant portion of
492
  documents duplicated up to 8 times. This simulation illustrates the inherent difficulties associated with
493
- measuring deduplication impact on the training of LLMs, once the biggest document clusters have been
494
  removed.</p>
495
  <h4>Other (failed) global approaches</h4>
496
  <p>We attempted to improve the performance of the
497
- independently minhash deduped 20 trillion tokens of data by further deduplicating it (globally, over all crawls) with the following methods</p>
498
  <ul>
499
  <li>URL deduplication, where we only kept one document per normalized
500
- (lowercased) URL (71.5% of tokens removed, 5.6T left) — <em>🍷 FineWeb URL dedup</em></li>
501
  </ul>
502
  <ul>
503
  <li>Line deduplication:
504
  <ul>
505
  <li>remove all but 1 (randomly chosen) occurrence of each duplicated line (77.8% of
506
- tokens dropped, 4.4T left) — <em>🍷 FineWeb line dedup</em></li>
507
  </ul>
508
  <ul>
509
  <li>same as above, but only removing duplicate lines with at least 10
510
  words and dropping documents with fewer than 3 sentences after deduplication (85% of tokens
511
- dropped, 2.9T left) — <em>🍷 FineWeb line dedup w/ min words</em></li>
512
  </ul>
513
  <ul>
514
  <li>remove all but 1 occurrence of each span of 3 duplicated lines
@@ -526,22 +535,21 @@
526
  </div>
527
  <h3>Additional filtering</h3>
528
  <p>By this point we had reached the same performance as
529
- RefinedWeb, but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
530
  the caveat that it is a relatively small dataset for current web-scale standards).</p>
531
  <p>We therefore set out to find new filtering steps that
532
- would, at first, allow us to match the performance of C4 and eventually surpass it. A natural starting point
533
  was to look into the processing of C4 itself.</p>
534
  <h4>C4: A dataset that has stood the test of time</h4>
535
  <p>The <a href="https://huggingface.co/datasets/c4">C4
536
  dataset</a> was first released in 2019. It was obtained from the <code>2019-18</code> CommonCrawl dump by
537
  removing non english data, applying some heuristic filters on both the line and document level,
538
- deduplicating on the line level and removing documents containing words from a word blocklist.</p>
539
- <p>Despite its age and limited size (around 175B gpt2
540
- tokens), models trained on this dataset have strong performance, excelling in particular on the Hellaswag
541
- benchmark, one of the benchmarks in our “early signal” group with the stronger signal and highest
542
- signal-over-noise ratio. As such, it has stayed a common sub-set of typical LLM training, for instance in
543
- the relatively recent Llama1 model<d-cite bibtex-key="touvron2023llama"></d-cite>. We experimented applying
544
- each of the different filters used in C4 to a baseline of the independently deduped 🍷 FineWeb 2019-18 dump:</p>
545
  <div class="main-plot-container">
546
  <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
547
  <div id="plot-c4_filters_hellaswag"></div>
@@ -569,7 +577,7 @@
569
  </li>
570
  </ul>
571
  <ul>
572
- <li>All filters except the very destructive terminal_punct perform better than
573
  terminal_punct by itself, while removing less in total (~7%)
574
  </li>
575
  </ul>
@@ -577,33 +585,35 @@
577
  the terminal punctuation one. We validated these results with a longer run, which you will find in a plot in
578
  the next section.</p>
579
  <h4>A statistical approach to develop heuristic filters</h4>
580
- <p>Due to our assumption that Full Minhash upsamples lower quality data in the oldest dumps, we were interested whether
581
- we could find heuristic filters which would remove them. In order to find such filters
582
- we collected a very large list of statistics (statistical metrics) — over <strong>50</strong> — on both the independently
583
- minhashed version and the result from the (worse quality) full dedup from 2013-48 and 2015-22 crawls (older crawls). We then compared the
584
- statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
585
- <p>The collected statistics ranged from common document-level
586
  metrics (e.g. number of lines, avg. line/word length, etc) to inter-document repetition metrics (MassiveText
587
- inspired). Perhaps not too surprisingly given our findings for deduplication, we found significant
588
  disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
589
  metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
590
- (0.0053 for 2015-22 and 0.0058 for 2013-48), to the full dedup (0.011 for 2015-22 and 0.01 for 2013-48),
591
  indicating that the latter had higher inter-document repetition.</p>
592
- <p>To choose the metrics for filtering, we computed Wasserstein distance between the two versions of 2013-48 crawl for all our metrics
593
- and then select the ones with the heighest distance. We would then inspect the histograms, empirically choose a threshold
594
- and filter the data and inspect the removed documents. This process yielded 17 candidate
595
- threshold-filter pairs. In the image below, you can see 3 of these histograms.</p>
596
  <div class="main-plot-container">
597
  <figure><img src="plots/custom_filters.png"/></figure>
598
  <div id="plot-stats"></div>
599
  </div>
600
 
601
- <p>As an example, we inspected the histograms of Fraction of lines ending with punctuation metric (see the image above) and observed the increased document density of Full Minhash at around 0.12 ratio.
602
- We then filtered with this threshold and found out that the removed data had a higher amount of short lists or consisted of only document layout text (Home Sign up etc...).
603
  </p>
604
 
605
- <p>To assess the effectiveness of these newly created
606
- filters, we conducted <strong>28B tokens </strong>ablation runs on the <strong>2019-18 crawl</strong>. Out
607
  of all those runs, we identified three filters (the ones based on the histograms above) that demonstrated
608
  the most significant improvements on the aggregate score:</p>
609
  <ul>
@@ -622,12 +632,13 @@
622
  </li>
623
  </ul>
624
  <ul>
625
- <li>When applying the 3 together, ~22% of tokens were removed</li>
626
  </ul>
627
  <div class="main-plot-container">
628
  <figure><img src="plots/custom_filters.png"/></figure>
629
  <div id="plot-custom-filters"></div>
630
  </div>
 
631
  <h2>The final dataset</h2>
632
  <p>The final 🍷 FineWeb dataset comprises 15T tokens and
633
  includes the following previously mentioned steps, in order, each providing a performance boost on our group
@@ -685,19 +696,20 @@
685
  <figure><img src="plots/dataset_ablations.png"/></figure>
686
  <div id="plot-dataset_ablations"></div>
687
  </div>
 
688
  <h2>📚 FineWeb-Edu</h2>
689
  <p>A new approach has recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers for identifying educational content. This technique was used in the trainings of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite> but its large-scale impact on web data filtering hasn't been fully explored or published.</p>
690
  <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
691
  <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
692
  <p>Similarly, Llama 3 blog post<d-cite bibtex-key="meta2024responsible"></d-cite> notes:</p>
693
  <blockquote>We found that previous generations of Llama are good at identifying high-quality data, so we used Llama 2 to help build the text-quality classifiers that are powering Llama 3.</blockquote>
694
- <p>However, these classifiers and filtered datasets are not publicly available. To enhance 🍷 FineWeb's quality, we developed an educational quality classifier using annotations generated by <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama-3-70B-Instruct</a> to create 📚 FineWeb-Edu.</p>
695
  <h3>Annotation</h3>
696
- <p>We used Llama-3-70B-Instruct to annotate 500k samples from the 🍷 FineWeb dataset, scoring each for their educational quality on a scale from 0 to 5.</p>
697
  <p>We explored various prompts and found that the additive scale by Yuan et al.<d-cite bibtex-key="yuan2024self"></d-cite> worked best. This scale allows the LLM to reason about each additional point awarded, unlike the single-rating Likert scale which fits samples into predefined boxes. Then, to avoid the LLM favoring highly technical pages like arXiv abstracts and submissions, we focused on grade-school and middle-school level knowledge. By setting a threshold of 3 (on a scale of 0 to 5) during the filtering process, we were able to also retain some high-level educational pages.</p>
698
  <div style="text-align: center; margin: 20px 0;">
699
  <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/fjZQ4izIj1rx1xQnBTKKr.png" alt="Prompt for LLM annotation" style="width: 90%; max-width: 800px; height: auto;">
700
- <figcaption style="font-style: italic; margin-top: 10px;">Prompt used for Llama3 annotations of the educational score, also available on <a href="https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier/blob/main/utils/prompt.txt">here</a>.</figcaption>
701
  </div>
702
  <p>We also experimented with <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x-7B-Instruct</a> and <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">Mixtral-8x22B-Instruct</a> and a jury of all three models<d-cite bibtex-key="verga2024replacing"></d-cite> but found that Llama3 alone gave the most reliable results.</p>
703
  <h3>Classifier Training</h3>
@@ -728,9 +740,8 @@
728
  <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
729
  <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>
730
  <h2>Next steps</h2>
731
- <p>We want to continue improving FineWeb and will also
732
- release a technical report with more details soon.</p>
733
- <p>Adapting the FineWeb recipe [wip]</p>
734
  </d-article>
735
 
736
  <d-appendix>
 
12
  <meta name="viewport" content="width=device-width, initial-scale=1">
13
  <meta charset="utf8">
14
  <base target="_blank">
15
+ <title>FineWeb: decanting the web for the finest text data at scale</title>
16
  <style>
17
 
18
  /* ****************************************
 
188
  <p>🍷 FineWeb, a 15-trillion token dataset derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots, produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies.</p>
189
  <p>We are also excited to announce the release of <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a version of 🍷 FineWeb that was filtered for educational content, available in two sizes: <strong>1.3 trillion (very high quality) and 5.4 trillion (high quality) tokens</strong>. 📚 FineWeb-Edu outperforms all existing public web datasets, with models pretrained on it showing notable improvements on knowledge- and reasoning-intensive benchmarks like MMLU, ARC, and OpenBookQA. You can
190
  download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">here</a>.</p>
191
+ <p>Both datasets are released under the permissive <strong><a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a></strong>.</p>
192
 
193
  <p>As 🍷 FineWeb has gathered a lot of interest from the
194
  community, we decided to further explain the steps involved in creating it, our processing decisions and
 
238
  a set of evaluation tasks. As we are curating a dataset for pretraining a generalist LLM, it is important to
239
  choose a diverse set of tasks and try not to overfit to any one individual benchmark.</p>
240
  <p>Another way to evaluate different datasets would be to
241
+ train a model on each one and have humans rate and compare their outputs (like on the <a
242
  href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
243
  reliable results in terms of representing real model usage, but getting ablation results this way is too
244
  expensive and slow. It also often requires that the models have undergone at least an instruction finetuning stage, as pretrained models have difficulty following instructions.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
 
265
  <ul>
266
  <li>small variance between runs trained on different samplings of the same
267
  dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
268
+ resulting scores to have as little evaluation noise as possible
269
  </li>
270
  </ul>
271
  <ul>
272
  <li>performance increasing monotonically (or close) over a training run:
273
  ideally, as the number of seen tokens increases, the performance on this benchmark should not decrease
274
+ (which would be indicative of unreliable results at a small scale)
275
  </li>
276
  </ul>
277
+ <p>We selected the following list of benchmarks:</p>
278
+ <ul>
279
+ <li>CommonSense QA<d-cite bibtex-key="talmor-etal-2019-commonsenseqa"></d-cite></li>
280
+ <li>HellaSwag<d-cite bibtex-key="zellers-etal-2019-hellaswag"></d-cite></li>
281
+ <li>OpenBook QA<d-cite bibtex-key="OpenBookQA2018"></d-cite></li>
282
+ <li>PIQA<d-cite bibtex-key="bisk2019piqa"></d-cite></li>
283
+ <li>SIQA<d-cite bibtex-key="sap2019socialiqa"></d-cite></li>
284
+ <li>WinoGrande<d-cite bibtex-key="sakaguchi2019winogrande"></d-cite></li>
285
+ <li>ARC<d-cite bibtex-key="clark2018think"></d-cite></li>
286
+ <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
287
+ </ul>
288
  <p>To
289
  have results quickly we capped longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
290
  min on a single node of 8 GPUs - done in parallel to the training).</p>
 
346
  extracted dumps (there are currently 96 dumps) we obtained roughly 36 trillion tokens of data (when
347
  tokenized with the <code>gpt2</code> tokenizer).</p>
348
  <h3>Deduplication</h3>
349
+ <p>Deduplication is one of the most important steps when creating large web datasets for LLM pretraining. Deduplication methods attempt to identify and remove redundant or repeated data.</p>
 
 
350
  <h4>Why deduplicate?</h4>
351
  <p>The web has many aggregators, mirrors, templated pages or
352
  just otherwise repeated content spread over different domains and webpages. Often, these duplicated pages
353
  can be introduced by the crawler itself, when different links point to the same page. </p>
354
  <p>Removing these duplicates (deduplicating) has been linked to an improvement in model performance<d-cite bibtex-key="lee2022deduplicating"></d-cite> and a reduction in memorization of pretraining data<d-cite bibtex-key="carlini2023quantifying"></d-cite>, which might
355
+ allow for better generalization. Additionally, the performance uplift obtained through deduplication can also be tied to increased training
356
  efficiency: by removing duplicated content, for the same number of training tokens, a model will have seen
357
  more diverse data.<d-cite bibtex-key="muennighoff2023scaling"></d-cite><d-cite bibtex-key="hernandez2022scaling"></d-cite></p>
358
  <p>There are different ways to identify and even define
 
361
  similarity metric to mark documents as duplicates, or “exact” by checking for exact matches between two
362
  documents (or lines, paragraphs, or whatever other granularity level being used).</p>
363
  <h4>Our deduplication parameters</h4>
364
+ <p>Similarly to RefinedWeb<d-cite bibtex-key="penedo2023refinedweb"></d-cite>, we decided to apply MinHash, a
365
+ fuzzy hash-based deduplication technique that scales well and allows us to tune similarity thresholds (by changing the number and size of buckets) and the granularity of the matches (by changing the n-gram size). We chose to compute minhashes on each document’s 5-grams, using
366
  112 hash functions in total, split into 14 buckets of 8 hashes each — targeting documents that are at least
367
  75% similar. Documents with the same 8 minhashes in any bucket are considered a duplicate of each other.</p>
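<p>For illustration, this bucketing configuration can be reproduced with the <code>datasketch</code> library. This is only a hedged sketch of the same scheme, not our production code (the actual FineWeb processing is built on the datatrove library), and the toy documents and keys below are made up:</p>
<pre><code class="language-python">
from datasketch import MinHash, MinHashLSH

def minhash(text: str, num_perm: int = 112) -> MinHash:
    """Build a MinHash signature over the word 5-grams of a document."""
    words = text.split()
    m = MinHash(num_perm=num_perm)
    for i in range(len(words) - 4):
        m.update(" ".join(words[i:i + 5]).encode("utf-8"))
    return m

# 14 buckets ("bands") of 8 hashes each: two documents collide if they share
# all 8 hashes in at least one bucket, targeting documents that are ~75%+ similar
lsh = MinHashLSH(num_perm=112, params=(14, 8))

docs = {"a": "the quick brown fox jumps over the lazy dog near the river bank today",
        "b": "the quick brown fox jumps over the lazy dog near the river bank again"}
for key, text in docs.items():
    lsh.insert(key, minhash(text))

# returns the keys of candidate duplicates of document "a" (including "a" itself)
print(lsh.query(minhash(docs["a"])))
</code></pre>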
368
+ <p>This would mean that for two documents with a similarity ($$s$$)
369
  of 0.7, 0.75, 0.8 and 0.85, the probability that they would be identified as duplicates would be 56%, 77%,
370
  92% and 98.8% respectively ($$1-(1-s^8)^{14}$$). See the plot below for a match probability
371
  comparison between our setup with 112 hashes and the one from RefinedWeb, with 9000 hashes, divided into 450
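<p>The match probability curve for a given bucketing scheme is easy to compute directly. The short, self-contained script below (a sanity check, not part of the pipeline) reproduces the numbers quoted above and can also be evaluated for the RefinedWeb configuration:</p>
<pre><code class="language-python">
def match_probability(s: float, buckets: int, hashes_per_bucket: int) -> float:
    """Probability that two documents with MinHash similarity s share all hashes in at least one bucket."""
    return 1 - (1 - s ** hashes_per_bucket) ** buckets

for s in (0.7, 0.75, 0.8, 0.85):
    p_fineweb = match_probability(s, buckets=14, hashes_per_bucket=8)       # 112 hashes in total
    p_refinedweb = match_probability(s, buckets=450, hashes_per_bucket=20)  # 9000 hashes in total
    print(f"s={s:.2f}  FineWeb setup: {p_fineweb:.1%}  RefinedWeb setup: {p_refinedweb:.1%}")

# the FineWeb column prints roughly 56%, 77%, 92% and 98.8%, matching the values quoted above
</code></pre>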
 
380
  <p>It should also be noted that intra-document deduplication is already handled by our repetition filter, which removes documents with many repeated lines and paragraphs.</p>
381
  <h4>More deduplication is always better, right?</h4>
382
  <p>Our initial approach was to take the entire dataset (all
383
+ 90+ dumps) and deduplicate them together as one big dataset using MinHash.</p>
384
  <p>We did this in an iterative manner: starting with the most
385
+ recent dump (which at the time was 2023-50) and proceeding chronologically until the oldest one, we would deduplicate each dump
386
+ not only within itself, but we would also remove any matches with documents from the previously processed (more recent)
387
  dumps. </p>
388
  <p>For instance, for the second most recent dump (2023-40 at
389
+ the time), we deduplicated it against the most recent one in addition to within itself. In particular, the oldest
390
+ dump was deduplicated against all other dumps. As a result, more data was removed from the oldest dumps (last
391
+ to be deduplicated) than from the most recent ones.</p>
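<p>In pseudocode, this iterative cross-dump setup looks roughly as follows. This is a simplified sketch that uses exact fingerprints as a stand-in for MinHash bucket matches; the function and variable names are illustrative only:</p>
<pre><code class="language-python">
def iterative_global_dedup(dumps_newest_first, fingerprints):
    """Deduplicate each dump against itself and against all previously processed (more recent) dumps.

    `dumps_newest_first` is an iterable of dumps (each a list of documents), most recent first.
    `fingerprints` maps a document to a set of hashable signatures; in the real pipeline these
    would be MinHash bucket signatures, here any exact fingerprint works as a stand-in.
    """
    seen = set()                      # signatures from this dump and all more recent ones
    kept_per_dump = []
    for dump in dumps_newest_first:   # 2023-50 first, the oldest dump last
        kept = []
        for doc in dump:
            sigs = fingerprints(doc)
            if seen.isdisjoint(sigs):  # no match within this dump nor in any more recent dump
                kept.append(doc)
                seen.update(sigs)
        kept_per_dump.append(kept)
    return kept_per_dump               # older dumps end up losing the most documents
</code></pre>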
392
  <p>Deduplicating the dataset in this manner resulted in 4
393
  trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
394
  tokens subset, the model showed no improvement over one trained on the non deduplicated data (see orange and
395
+ green curves below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
396
  <div class="main-plot-container">
397
  <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
398
  <div id="plot-dedup_all_dumps_bad"></div>
 
404
  <li>pre deduplication, this dump had ~490 billion tokens</li>
405
  </ul>
406
  <ul>
407
+ <li>after our iterative MinHash, ~31 billion tokens remained (94% of data had been
408
  removed)
409
  </li>
410
  </ul>
 
421
  iterative dedup process (<em>originally removed data</em>)<d-footnote>While there may be documents in <em>originally kept
422
  data</em> similar to documents in <em>originally removed data</em>, we estimate the overlap to be small (around 4 billion tokens)</d-footnote>
423
  </li>
 
424
  </ul>
425
  <div class="main-plot-container">
426
  <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
427
  <div id="plot-removed_data_cross_dedup"></div>
428
  </div>
429
+ <p>These results show that, for this older dump from which we had
430
+ removed over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
431
  removed (considered independently of all the other dumps). This is also confirmed by visual inspection: <em>originally kept
432
  data</em> contains far more ads, lists of keywords and generally badly formatted text than <em>originally removed data</em>.</p>
433
  <h4>Taking a step back: individual dump dedup</h4>
 
443
  <p>We hypothesize that the main improvement gained from
444
  deduplication is the removal of very large clusters that are present in every single dump (you will find
445
  some examples of these clusters on the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
446
+ documents) and that further deduplication of clusters with a low number of duplicates (fewer than ~100, i.e. the number
447
+ of dumps) actually harms performance: data that does not find a duplicate match in any other dump might
448
  actually be worse quality/more out of distribution (as evidenced by the results on the 2013-48 data). </p>
449
  <p>While you might see some performance improvement when
450
+ deduplicating a few dumps together, at the scale of the entire dataset (all the dumps), the side effect of upsampling lower
451
+ quality data seems to have a greater impact.</p>
452
  <p>One possibility to consider is that as filtering quality
453
  improves, this effect may not be as prevalent, since the filtering might be able to remove some of this
454
  lower quality data. We also experimented with applying different, and often “lighter”, deduplication
 
462
  tokens on measuring deduplication impact, we considered the following (very extreme and unrealistic
463
  regarding the degree of duplication observed) theoretical scenario:</p>
464
  <ul>
465
+ <li>there are 100 CommonCrawl dumps (roughly accurate)</li>
466
  </ul>
467
  <ul>
468
  <li>each dump has been perfectly individually deduplicated (every single
 
494
  (#duplicates=1), despite the fact that in the entire dataset each document is repeated 100 times (once per
495
  dump). We start seeing some changes at the 100B scale (0.5% of the total dataset), with a large number of
496
  documents being repeated twice, and a few even 4-8 times. At the larger scale of 1T (5% of the total
497
+ dataset), the majority of the documents are repeated up to 8 times, with some being repeated up to 16
498
  times. </p>
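<p>This scenario is simple to simulate: if every document appears once in each of the 100 dumps, the number of copies of a given document that land in a uniformly sampled subset approximately follows a Binomial(100, sampling fraction) distribution. The toy script below (with a made-up number of unique documents, not the real corpus size) reproduces the qualitative picture described above:</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(seed=0)

n_dumps = 100                 # each unique document appears once per dump
n_unique_docs = 1_000_000     # toy number of unique documents

for sampling_fraction in (0.0005, 0.005, 0.05):   # ~10B, 100B and 1T tokens out of a 20T corpus
    # number of copies of each unique document that end up in the sampled subset
    copies = rng.binomial(n=n_dumps, p=sampling_fraction, size=n_unique_docs)
    counts = np.bincount(copies)
    seen = {k: int(c) for k, c in enumerate(counts) if k > 0 and c > 0}
    print(f"sampling fraction {sampling_fraction:.2%}: copies -> #documents {seen}")
</code></pre>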
499
  <p>We ran our performance evaluations for the deduplicated
500
  data at the 350B scale, which would, under this theoretical scenario, be made up of a significant portion of
501
  documents duplicated up to 8 times. This simulation illustrates the inherent difficulties associated with
502
+ measuring deduplication impact on the training of LLMs, once the biggest duplicate clusters have been
503
  removed.</p>
504
  <h4>Other (failed) global approaches</h4>
505
  <p>We attempted to improve the performance of the
506
+ independently MinHash-deduplicated data (20 trillion tokens) by further deduplicating it (globally, over all dumps) with the following methods:</p>
507
  <ul>
508
  <li>URL deduplication, where we only kept one document per normalized
509
+ (lowercased) URL (71.5% of tokens removed, 5.6T left) — <em>FineWeb URL dedup</em></li>
510
  </ul>
511
  <ul>
512
  <li>Line deduplication:
513
  <ul>
514
  <li>remove all but 1 (randomly chosen) occurrence of each duplicated line (77.8% of
515
+ tokens dropped, 4.4T left) — <em>FineWeb line dedup</em></li>
516
  </ul>
517
  <ul>
518
  <li>same as above, but only removing duplicate lines with at least 10
519
  words and dropping documents with fewer than 3 sentences after deduplication (85% of tokens
520
+ dropped, 2.9T left) — <em>FineWeb line dedup w/ min words</em></li>
521
  </ul>
522
  <ul>
523
  <li>remove all but 1 occurrence of each span of 3 duplicated lines
 
535
  </div>
536
  <h3>Additional filtering</h3>
537
  <p>By this point we had reached the same performance as
538
+ RefinedWeb (with our base filtering + independent MinHash), but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
539
  the caveat that it is a relatively small dataset for current web-scale standards).</p>
540
  <p>We therefore set out to find new filtering steps that
541
+ would, at first, allow us to match the performance of C4 and then, in a second stage, surpass it. A natural starting point
542
  was to look into the processing of C4 itself.</p>
543
  <h4>C4: A dataset that has stood the test of time</h4>
544
  <p>The <a href="https://huggingface.co/datasets/c4">C4
545
  dataset</a> was first released in 2019. It was obtained from the <code>2019-18</code> CommonCrawl dump by
546
  removing non english data, applying some heuristic filters on both the line and document level,
547
+ deduplicating on the line level, and removing documents containing words from a word blocklist.</p>
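<p>For reference, the main C4 heuristics (as described in the original paper) can be sketched roughly as follows. Thresholds follow the paper, but this is a loose, hedged reimplementation rather than the exact original code:</p>
<pre><code class="language-python">
import re

TERMINAL_PUNCT = (".", "!", "?", '"')

def c4_like_filter(text: str, badwords: frozenset = frozenset()) -> str | None:
    """Loose sketch of the C4 heuristics: returns the cleaned page, or None to drop it entirely."""
    kept_lines = []
    for line in (l.strip() for l in text.splitlines()):
        if not line.endswith(TERMINAL_PUNCT):   # keep only lines ending in terminal punctuation
            continue
        if len(line.split()) < 5:               # keep only lines with at least 5 words
            continue
        if "javascript" in line.lower():        # drop "enable javascript" style boilerplate lines
            continue
        kept_lines.append(line)

    page = "\n".join(kept_lines)
    if len(re.findall(r"[.!?]", page)) < 3:     # drop pages with fewer than 3 sentences (rough proxy)
        return None
    if "{" in page or "lorem ipsum" in page.lower():
        return None
    if any(word in badwords for word in page.lower().split()):
        return None
    return page
</code></pre>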
548
+ <p>Despite its age and limited size by current standards (around 175B gpt2 tokens), this dataset is, to this day, a common subset of typical LLM training data, being used in models such as the relatively recent Llama1<d-cite bibtex-key="touvron2023llama"></d-cite>.
549
+ This success is due to the strong performance that models trained on this dataset exhibit, excelling in particular on the Hellaswag
550
+ benchmark <d-cite bibtex-key="zellers-etal-2019-hellaswag"></d-cite>, one of the benchmarks in our “early signal” group with the highest
551
+ signal-to-noise ratio. We experimented with applying
552
+ each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump:</p>
 
553
  <div class="main-plot-container">
554
  <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
555
  <div id="plot-c4_filters_hellaswag"></div>
 
577
  </li>
578
  </ul>
579
  <ul>
580
+ <li>"All filters except the (very destructive) terminal_punct" performs better than
581
  terminal_punct by itself, while removing less in total (~7%)
582
  </li>
583
  </ul>
 
585
  the terminal punctuation one. We validated these results with a longer run, which you will find in a plot in
586
  the next section.</p>
587
  <h4>A statistical approach to develop heuristic filters</h4>
588
+ <p>To develop new heuristic filters and select their thresholds we devised a systematic process:</p>
589
+ <ol><li>we started by collecting a very large list of high-level statistics (over <strong>50</strong>) ranging from common document-level
590
  metrics (e.g. number of lines, avg. line/word length, etc) to inter-document repetition metrics (MassiveText
591
+ inspired), on both a high quality and a lower quality web dataset;</li>
592
+ <li>we selected the metrics for which the Wasserstein distance between the two distributions (of the metric computed on each dataset) was the largest;</li>
593
+ <li>we inspected the histograms of the two distributions and empirically chose a threshold that would make the lower quality dataset more closely resemble the higher quality one on this metric;</li>
594
+ <li>we validated the resulting filter (metric-threshold pair) by using it on a reference dataset and running small ablations.</li>
595
+ </ol>
596
+ <p>Due to our assumption that global MinHash greatly upsamples lower quality data in the oldest dumps, we computed metrics on both the independently
597
+ MinHashed and the (worse quality) global MinHashed versions of the 2013-48 and 2015-22 crawls (two older crawls). We then compared the
598
+ statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
599
+ <p>Perhaps not too surprisingly given our findings for deduplication, we found significant
600
  disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
601
  metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
602
+ (0.0053 for 2015-22 and 0.0058 for 2013-48), to the global dedup (0.011 for 2015-22 and 0.01 for 2013-48),
603
  indicating that the latter had higher inter-document repetition.</p>
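<p>Concretely, steps 1 and 2 of the process above can be sketched as follows for the <code>line-char-duplicates</code> metric. This is a toy illustration assuming small in-memory lists of documents, using <code>scipy</code>'s Wasserstein distance rather than our internal tooling:</p>
<pre><code class="language-python">
from collections import Counter

import numpy as np
from scipy.stats import wasserstein_distance

def line_char_duplicates(doc: str) -> float:
    """Nb. of characters in duplicated lines / nb. of characters (one of the 50+ collected metrics)."""
    lines = [l for l in doc.splitlines() if l.strip()]
    if not lines:
        return 0.0
    counts = Counter(lines)
    duplicated_chars = sum(len(l) for l in lines if counts[l] > 1)
    return duplicated_chars / sum(len(l) for l in lines)

# toy stand-ins for the two versions of an older crawl (e.g. 2013-48)
independent_minhash_docs = ["A first sentence.\nA second, different sentence.\nAnd a third one."]
global_minhash_docs = ["Home\nHome\nSign up\nSign up\nSome actual text in between."]

a = np.array([line_char_duplicates(d) for d in independent_minhash_docs])
b = np.array([line_char_duplicates(d) for d in global_minhash_docs])

# metrics whose distributions differ the most between the two versions are candidate filters
print("Wasserstein distance:", wasserstein_distance(a, b))
</code></pre>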
604
+ <p>Following the process listed above for these datasets yielded 17 candidate
605
+ metric-threshold pairs. In the image below, you can see 3 of these histograms:</p>
 
 
606
  <div class="main-plot-container">
607
  <figure><img src="plots/custom_filters.png"/></figure>
608
  <div id="plot-stats"></div>
609
  </div>
610
 
611
+ <p>As an example, we inspected the histograms of the "fraction of lines ending with punctuation" metric (see the image above) and observed that the global MinHash version had an increased density of documents at a ratio of around 0.12.
612
+ We then filtered with this threshold and found that the removed data contained more short lists or consisted only of document layout text ("Home", "Sign up", etc).
613
  </p>
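<p>Applied as a filter, this looks roughly like the sketch below, assuming (as the inspection above suggests) that documents at or below the 0.12 ratio are the ones to remove; the exact punctuation set and line handling in our pipeline may differ:</p>
<pre><code class="language-python">
PUNCT_RATIO_THRESHOLD = 0.12   # chosen empirically from the histograms above

def keep_document(doc: str) -> bool:
    """Drop documents where too few lines end with punctuation (navigation menus, keyword lists, ...)."""
    lines = [l.rstrip() for l in doc.splitlines() if l.strip()]
    if not lines:
        return False
    ratio = sum(l.endswith((".", "!", "?", '"')) for l in lines) / len(lines)
    return ratio > PUNCT_RATIO_THRESHOLD

print(keep_document("Home\nSign up\nProducts\nContact us"))           # False: layout-only text
print(keep_document("A real sentence.\nFollowed by another one."))    # True
</code></pre>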
614
 
615
+ <p>We then assessed the effectiveness of these 17 newly created
616
+ filters by conducting <strong>28B token</strong> ablation runs on the <strong>2019-18 crawl</strong>. Out
617
  of all those runs, we identified three filters (the ones based on the histograms above) that demonstrated
618
  the most significant improvements on the aggregate score:</p>
619
  <ul>
 
632
  </li>
633
  </ul>
634
  <ul>
635
+ <li>When applying the 3 together, ~22% of tokens were removed.</li>
636
  </ul>
637
  <div class="main-plot-container">
638
  <figure><img src="plots/custom_filters.png"/></figure>
639
  <div id="plot-custom-filters"></div>
640
  </div>
641
+ <p>These filters allowed us to further improve performance and, notably, to surpass the performance of the C4 dataset.</p>
642
  <h2>The final dataset</h2>
643
  <p>The final 🍷 FineWeb dataset comprises 15T tokens and
644
  includes the following previously mentioned steps, in order, each providing a performance boost on our group
 
696
  <figure><img src="plots/dataset_ablations.png"/></figure>
697
  <div id="plot-dataset_ablations"></div>
698
  </div>
699
+ <p>Large language models pretrained on 🍷 FineWeb, the largest publicly available clean LLM pretraining dataset, perform better than models trained on other open pretraining datasets.</p>
700
  <h2>📚 FineWeb-Edu</h2>
701
  <p>A new approach has recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers for identifying educational content. This technique was used in the trainings of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite> but its large-scale impact on web data filtering hasn't been fully explored or published.</p>
702
  <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
703
  <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
704
  <p>Similarly, Llama 3 blog post<d-cite bibtex-key="meta2024responsible"></d-cite> notes:</p>
705
  <blockquote>We found that previous generations of Llama are good at identifying high-quality data, so we used Llama 2 to help build the text-quality classifiers that are powering Llama 3.</blockquote>
706
+ <p>However, these classifiers and filtered datasets are not publicly available. To further enhance 🍷 FineWeb's quality, we developed an educational quality classifier using annotations generated by <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama-3-70B-Instruct</a> to create <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>.</p>
707
  <h3>Annotation</h3>
708
+ <p>We used Llama-3-70B-Instruct to annotate 500k samples from 🍷 FineWeb, scoring each for its educational quality on a scale from 0 to 5.</p>
709
  <p>We explored various prompts and found that the additive scale by Yuan et al.<d-cite bibtex-key="yuan2024self"></d-cite> worked best. This scale allows the LLM to reason about each additional point awarded, unlike the single-rating Likert scale which fits samples into predefined boxes. Then, to avoid the LLM favoring highly technical pages like arXiv abstracts and submissions, we focused on grade-school and middle-school level knowledge. By setting a threshold of 3 (on a scale of 0 to 5) during the filtering process, we were able to also retain some high-level educational pages.</p>
710
  <div style="text-align: center; margin: 20px 0;">
711
  <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/fjZQ4izIj1rx1xQnBTKKr.png" alt="Prompt for LLM annotation" style="width: 90%; max-width: 800px; height: auto;">
712
+ <figcaption style="font-style: italic; margin-top: 10px;">Prompt used for Llama3 annotations of the educational score, also available <a href="https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier/blob/main/utils/prompt.txt">here</a>.</figcaption>
713
  </div>
714
  <p>We also experimented with <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x-7B-Instruct</a> and <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">Mixtral-8x22B-Instruct</a> and a jury of all three models<d-cite bibtex-key="verga2024replacing"></d-cite> but found that Llama3 alone gave the most reliable results.</p>
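<p>A minimal sketch of this annotation loop is shown below. It assumes the prompt from the figure above is stored locally with a <code>{text}</code> placeholder and that the model is queried through the Hugging Face Inference API; the large-scale setup actually used for the 500k annotations, and the exact phrasing the score is parsed from, may differ:</p>
<pre><code class="language-python">
import re
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
PROMPT = open("prompt.txt").read()   # assumption: the prompt above, with a "{text}" placeholder

def edu_score(sample_text: str) -> int | None:
    """Ask the model for an additive 0-5 educational score and parse it from its answer."""
    answer = client.chat_completion(
        messages=[{"role": "user", "content": PROMPT.replace("{text}", sample_text)}],
        max_tokens=512,
        temperature=0.0,
    ).choices[0].message.content
    # assumption: the prompt asks the model to conclude with a line like "Educational score: X"
    match = re.search(r"(?i)educational score:\s*([0-5])", answer)
    return int(match.group(1)) if match else None

# documents scoring 3 or higher are the ones kept in 📚 FineWeb-Edu
</code></pre>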
715
  <h3>Classifier Training</h3>
 
740
  <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
741
  <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>
742
  <h2>Next steps</h2>
743
+ <p>Through our open data efforts we hope to give every model trainer the ability to create state-of-the-art large language models. As part of this process, we plan to continue iterating on FineWeb and to release more specialised filtered subsets of web data, in a fully open and reproducible manner.</p>
744
+ <p>While English currently dominates the large language model landscape, we believe that making high-quality training data for other languages more easily accessible would allow millions of non-English speakers to benefit from these technologies and, as such, we will also strive to adapt the FineWeb recipe to a multilingual version.</p>
 
745
  </d-article>
746
 
747
  <d-appendix>
src/clusters.js CHANGED
@@ -33,7 +33,7 @@ const DEFAULT_XAXIS = {
33
  showgrid: false,
34
  zeroline: false,
35
  title: {
36
- text: "<a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>Fineweb dataset</a>",
37
  font: {
38
  size: 16,
39
  style: "italic",
 
33
  showgrid: false,
34
  zeroline: false,
35
  title: {
36
+ text: "The 🍷 FineWeb dataset, <a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>clustered</a> and annotated with educational score labels",
37
  font: {
38
  size: 16,
39
  style: "italic",
src/plotting.js CHANGED
@@ -11,9 +11,9 @@ const BAR_SETTINGS = {
11
  const TASK_ID_TO_NAME = {
12
  // Ablations
13
  agg_score: "Aggregate Score",
14
- "commonsense_qa/acc_norm": "Commonsense QA Norm",
15
  "hellaswag/acc_norm": "HellaSwag",
16
- "openbookqa/acc_norm": "OpenBook QA Norm",
17
  "piqa/acc_norm": "PIQA",
18
  "siqa/acc_norm": "Social IQA",
19
  "winogrande/acc_norm": "WinoGrande",
 
11
  const TASK_ID_TO_NAME = {
12
  // Ablations
13
  agg_score: "Aggregate Score",
14
+ "commonsense_qa/acc_norm": "Commonsense QA",
15
  "hellaswag/acc_norm": "HellaSwag",
16
+ "openbookqa/acc_norm": "OpenBook QA",
17
  "piqa/acc_norm": "PIQA",
18
  "siqa/acc_norm": "Social IQA",
19
  "winogrande/acc_norm": "WinoGrande",