Spaces:

HuggingFaceFW
/

blogpost-fineweb-v1

Running

App Files Files Community

hynky HF staff committed on May 31, 2024

Commit

fb0323d

2 Parent(s): 0d2b8da 09e5351

Merge branch 'main' of hf.co:spaces/HuggingFaceFW/blogpost-fineweb-v1

Browse files

Files changed (4) hide show

bibliography.bib +84 -0
index.html +77 -66
src/clusters.js +1 -1
src/plotting.js +2 -2

bibliography.bib CHANGED Viewed

@@ -234,4 +234,88 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
   year = {2024},
   url = {https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/},
   note = {Accessed: 2024-05-31}
 }

   year = {2024},
   url = {https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/},
   note = {Accessed: 2024-05-31}
+}
+@inproceedings{talmor-etal-2019-commonsenseqa,
+    title = "CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge",
+    author = "Talmor, Alon  and
+      Herzig, Jonathan  and
+      Lourie, Nicholas  and
+      Berant, Jonathan",
+    booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
+    month = jun,
+    year = "2019",
+    address = "Minneapolis, Minnesota",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/N19-1421",
+    doi = "10.18653/v1/N19-1421",
+    pages = "4149--4158",
+    archivePrefix = "arXiv",
+    eprint        = "1811.00937",
+    primaryClass  = "cs",
+}
+@inproceedings{zellers-etal-2019-hellaswag,
+    title = "HellaSwag: Can a Machine Really Finish Your Sentence?",
+    author = "Zellers, Rowan  and
+      Holtzman, Ari  and
+      Bisk, Yonatan  and
+      Farhadi, Ali  and
+      Choi, Yejin",
+    editor = "Korhonen, Anna  and
+      Traum, David  and
+      M{\`a}rquez, Llu{\'\i}s",
+    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2019",
+    address = "Florence, Italy",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/P19-1472",
+    doi = "10.18653/v1/P19-1472",
+    pages = "4791--4800",
+    abstract = "Recent work by Zellers et al. (2018) introduced a new task of commonsense natural language inference: given an event description such as {``}A woman sits at a piano,{''} a machine must select the most likely followup: {``}She sets her fingers on the keys.{''} With the introduction of BERT, near human-level performance was reached. Does this mean that machines can perform human level commonsense inference? In this paper, we show that commonsense inference still proves difficult for even state-of-the-art models, by presenting HellaSwag, a new challenge dataset. Though its questions are trivial for humans ({\textgreater}95{\%} accuracy), state-of-the-art models struggle ({\textless}48{\%}). We achieve this via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. AF proves to be surprisingly robust. The key insight is to scale up the length and complexity of the dataset examples towards a critical {`}Goldilocks{'} zone wherein generated text is ridiculous to humans, yet often misclassified by state-of-the-art models. Our construction of HellaSwag, and its resulting difficulty, sheds light on the inner workings of deep pretrained models. More broadly, it suggests a new path forward for NLP research, in which benchmarks co-evolve with the evolving state-of-the-art in an adversarial way, so as to present ever-harder challenges.",
+}
+@inproceedings{OpenBookQA2018,
+ title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
+ author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
+ booktitle={EMNLP},
+ year={2018}
+}
+@misc{bisk2019piqa,
+      title={PIQA: Reasoning about Physical Commonsense in Natural Language},
+      author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
+      year={2019},
+      eprint={1911.11641},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{sap2019socialiqa,
+      title={SocialIQA: Commonsense Reasoning about Social Interactions},
+      author={Maarten Sap and Hannah Rashkin and Derek Chen and Ronan Le Bras and Yejin Choi},
+      year={2019},
+      eprint={1904.09728},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{sakaguchi2019winogrande,
+      title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
+      author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+      year={2019},
+      eprint={1907.10641},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{clark2018think,
+      title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+      author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+      year={2018},
+      eprint={1803.05457},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+@misc{hendrycks2021measuring,
+      title={Measuring Massive Multitask Language Understanding},
+      author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+      year={2021},
+      eprint={2009.03300},
+      archivePrefix={arXiv},
+      primaryClass={cs.CY}
 }

index.html CHANGED Viewed

@@ -12,7 +12,7 @@
     <meta name="viewport" content="width=device-width, initial-scale=1">
     <meta charset="utf8">
     <base target="_blank">
-    <title>FineWeb: 15T tokens of high quality web data</title>
     <style>
         /* ****************************************
@@ -188,6 +188,7 @@
     <p>🍷 FineWeb, a 15-trillion token dataset derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots, produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies.</p>
     <p>We are also excited to announce the release of <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a version of 🍷 FineWeb that was filtered for educational content, available in two sizes: <strong>1.3 trillion (very high quality) and 5.4 trillion (high quality) tokens</strong>. 📚 FineWeb-Edu outperforms all existing public web datasets, with models pretrained on it showing notable improvements on knowledge- and reasoning-intensive benchmarks like MMLU, ARC, and OpenBookQA. You can
         download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">here</a>.</p>
     <p>As 🍷 FineWeb has gathered a lot of interest from the
         community, we decided to further explain the steps involved in creating it, our processing decisions and
@@ -237,7 +238,7 @@
         a set of evaluation tasks. As we are curating a dataset for pretraining a generalist LLM, it is important to
         choose a diverse set of tasks and try not to overfit to any one individual benchmark.</p>
     <p>Another way to evaluate different datasets would be to
-        train a model on each one and have humans rate and compare the outputs of each one (like on the <a
                 href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
         reliable results in terms of representing real model usage, but getting ablation results this way is too
         expensive and slow. It also often requires that the models have undergone at least an instruction finetuning stage, as pretrained models have difficulty following instructions.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
@@ -264,15 +265,26 @@
     <ul>
         <li>small variance between runs trained on different samplings of the same
             dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
-            resulting scores to have as little noise as possible
         </li>
     </ul>
     <ul>
         <li>performance increasing monotonically (or close) over a training run:
             ideally, as the number of seen tokens increases, the performance on this benchmark should not decrease
-            (should not be too noisy)
         </li>
     </ul>
     <p>To
         have results quickly we capped longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
         min on a single node of 8 GPUs - done in parallel to the training).</p>
@@ -334,15 +346,13 @@
         extracted dumps (there are currently 96 dumps) we obtained roughly 36 trillion tokens of data (when
         tokenized with the <code>gpt2</code> tokenizer).</p>
     <h3>Deduplication</h3>
-    <p>Deduplication is another important step, especially for web
-        datasets. Methods to deduplicate datasets attempt to remove redundant/repeated data. Deduplication is one of
-        the most important steps when creating large web datasets for LLMs.</p>
     <h4>Why deduplicate?</h4>
     <p>The web has many aggregators, mirrors, templated pages or
         just otherwise repeated content spread over different domains and webpages. Often, these duplicated pages
         can be introduced by the crawler itself, when different links point to the same page. </p>
     <p>Removing these duplicates (deduplicating) has been linked to an improvement in model performance<d-cite bibtex-key="lee2022deduplicating"></d-cite> and a reduction in memorization of pretraining data<d-cite bibtex-key="carlini2023quantifying"></d-cite>, which might
-        allow for better generalization. Additionally, the performance uplift can also be tied to increased training
         efficiency: by removing duplicated content, for the same number of training tokens, a model will have seen
         more diverse data.<d-cite bibtex-key="muennighoff2023scaling"></d-cite><d-cite bibtex-key="hernandez2022scaling"></d-cite></p>
     <p>There are different ways to identify and even define
@@ -351,11 +361,11 @@
         similarity metric to mark documents as duplicates, or “exact” by checking for exact matches between two
         documents (or lines, paragraphs, or whatever other granularity level being used).</p>
     <h4>Our deduplication parameters</h4>
-    <p>Similarly to RefinedWeb, we decided to apply MinHash, a
-        fuzzy hash based deduplication technique that scales well and allows us to tune similarity thresholds (by changing the number and size of buckets) and the granularity of the matches (changing the n-gram size). We chose to compute minhashes on each document’s 5-grams, using
         112 hash functions in total, split into 14 buckets of 8 hashes each — targeting documents that are at least
         75% similar. Documents with the same 8 minhashes in any bucket are considered a duplicate of each other.</p>
-    <p>This would mean that for two documents with a similarity (<code>s</code>)
         of 0.7, 0.75, 0.8 and 0.85, the probability that they would be identified as duplicates would be 56%, 77%,
         92% and 98.8% respectively ($$1-(1-s^8)^{14}$$). See the plot below for a match probability
         comparison between our setup with 112 hashes and the one from RefinedWeb, with 9000 hashes, divided into 450
@@ -370,19 +380,19 @@
     <p>It should also be noted that intra-document deduplication is already handled by our repetition filter, which removes documents with many repeated lines and paragraphs.</p>
     <h4>More deduplication is always better, right?</h4>
     <p>Our initial approach was to take the entire dataset (all
-        96 dumps) and deduplicate them as one big dataset using MinHash.</p>
     <p>We did this in an iterative manner: starting with the most
-        recent dump (which at the time was 2023-50) and taking the oldest one last, we would deduplicate each dump
-        not only against itself but also by removing any matches with duplicates from the previously processed
         dumps. </p>
     <p>For instance, for the second most recent dump (2023-40 at
-        the time), we deduplicated it against the most recent one in addition to itself. In particular, the oldest
-        dump was deduplicated against all other dumps. As a result, more data was removed in the oldest dumps (last
-        to be deduplicated) than in the most recent ones.</p>
     <p>Deduplicating the dataset in this manner resulted in 4
         trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
         tokens subset, the model showed no improvement over one trained on the non-deduplicated data (see orange and
-        green curves below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
     <div class="main-plot-container">
         <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
         <div id="plot-dedup_all_dumps_bad"></div>
@@ -394,7 +404,7 @@
         <li>pre deduplication, this dump had ~490 billion tokens</li>
     </ul>
     <ul>
-        <li>after our iterative MinHash, ~31 billion tokens remained (94% of data
             removed)
         </li>
     </ul>
@@ -411,14 +421,13 @@
             iterative dedup process (<em>originally removed data</em>)<d-footnote>While there may be documents in <em>originally kept
             data</em> similar to documents in <em>originally removed data</em>, we estimate the overlap to be small (around 4 billion tokens)</d-footnote>
         </li>
     </ul>
     <div class="main-plot-container">
         <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
         <div id="plot-removed_data_cross_dedup"></div>
     </div>
-    <p>These results show that, for this older dump where we were
-        removing over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
         removed (considered independently of all the other dumps). This is also confirmed by visual inspection: <em>originally kept
             data</em> contains far more ads, lists of keywords and generally badly formatted text than <em>originally removed data</em>.</p>
     <h4>Taking a step back: individual dump dedup</h4>
@@ -434,12 +443,12 @@
     <p>We hypothesize that the main improvement gained from
         deduplication is the removal of very large clusters that are present in every single dump (you will find
         some examples of these clusters in the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
-        documents) and that further deduplication for clusters with a low number of duplicates (less than ~100 i.e. the number
-        of dumps) actually harms performance: data that does not find a duplicate match in any other dump might
         actually be worse quality/more out of distribution (as evidenced by the results on the 2013-48 data). </p>
     <p>While you might see some performance improvement when
-        deduplicating a few dumps together, at the scale of all the dumps this upsampling of lower quality data side
-        effect seems to have a great impact.</p>
     <p>One possibility to consider is that as filtering quality
         improves, this effect may not be as prevalent, since the filtering might be able to remove some of this
         lower quality data. We also experimented with applying different, and often “lighter”, deduplication
@@ -453,7 +462,7 @@
         tokens on measuring deduplication impact, we considered the following (very extreme and unrealistic
         regarding the degree of duplication observed) theoretical scenario:</p>
     <ul>
-        <li>there are 100 CommonCrawl dumps (actually roughly true)</li>
     </ul>
     <ul>
         <li>each dump has been perfectly individually deduplicated (every single
@@ -485,30 +494,30 @@
         (#duplicates=1), despite the fact that in the entire dataset each document is repeated 100 times (once per
         dump). We start seeing some changes at the 100B scale (0.5% of the total dataset), with a large number of
         documents being repeated twice, and a few even 4-8 times. At the larger scale of 1T (5% of the total
-        dataset), the majority of the documents are repeated up to 8 times, with some being repeated up to 16
         times. </p>
     <p>We ran our performance evaluations for the deduplicated
         data at the 350B scale, which would, under this theoretical scenario, be made up of a significant portion of
         documents duplicated up to 8 times. This simulation illustrates the inherent difficulties associated with
-        measuring deduplication impact on the training of LLMs, once the biggest document clusters have been
         removed.</p>
     <h4>Other (failed) global approaches</h4>
     <p>We attempted to improve the performance of the
-        independently minhash deduped 20 trillion tokens of data by further deduplicating it (globally, over all crawls) with the following methods</p>
     <ul>
         <li>URL deduplication, where we only kept one document per normalized
-            (lowercased) URL (71.5% of tokens removed, 5.6T left) — <em>🍷 FineWeb URL dedup</em></li>
     </ul>
     <ul>
         <li>Line deduplication:
             <ul>
                 <li>remove all but 1 (randomly chosen) occurrence of each duplicated line (77.8% of
-                    tokens dropped, 4.4T left) — <em>🍷 FineWeb line dedup</em></li>
             </ul>
             <ul>
                 <li>same as above, but only removing duplicate lines with at least 10
                     words and dropping documents with fewer than 3 sentences after deduplication (85% of tokens
-                    dropped, 2.9T left) — <em>🍷 FineWeb line dedup w/ min words</em></li>
             </ul>
             <ul>
                 <li>remove all but 1 occurrence of each span of 3 duplicated lines
@@ -526,22 +535,21 @@
     </div>
     <h3>Additional filtering</h3>
     <p>By this point we had reached the same performance as
-        RefinedWeb, but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
         the caveat that it is a relatively small dataset for current web-scale standards).</p>
     <p>We therefore set out to find new filtering steps that
-        would, at first, allow us to match the performance of C4 and eventually surpass it. A natural starting point
         was to look into the processing of C4 itself.</p>
     <h4>C4: A dataset that has stood the test of time</h4>
     <p>The <a href="https://huggingface.co/datasets/c4">C4
         dataset</a> was first released in 2019. It was obtained from the <code>2019-18</code> CommonCrawl dump by
         removing non-English data, applying some heuristic filters on both the line and document level,
-        deduplicating on the line level and removing documents containing words from a word blocklist.</p>
-    <p>Despite its age and limited size (around 175B gpt2
-        tokens), models trained on this dataset have strong performance, excelling in particular on the Hellaswag
-        benchmark, one of the benchmarks in our “early signal” group with the stronger signal and highest
-        signal-over-noise ratio. As such, it has stayed a common sub-set of typical LLM training, for instance in
-        the relatively recent Llama1 model<d-cite bibtex-key="touvron2023llama"></d-cite>. We experimented with applying
-        each of the different filters used in C4 to a baseline of the independently deduped 🍷 FineWeb 2019-18 dump:</p>
     <div class="main-plot-container">
         <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
         <div id="plot-c4_filters_hellaswag"></div>
@@ -569,7 +577,7 @@
         </li>
     </ul>
     <ul>
-        <li>All filters except the very destructive terminal_punct perform better than
             terminal_punct by itself, while removing less in total (~7%)
         </li>
     </ul>
@@ -577,33 +585,35 @@
         the terminal punctuation one. We validated these results with a longer run, which you will find in a plot in
         the next section.</p>
     <h4>A statistical approach to develop heuristic filters</h4>
-    <p>Due to our assumption that Full Minhash upsamples lower quality data in the oldest dumps, we were interested in whether
-        we could find heuristic filters which would remove them. In order to find such filters
-        we collected a very large list of statistics (statistical metrics) — over <strong>50</strong> — on both the independently
-        minhashed version and the result from the (worse quality) full dedup from 2013-48 and 2015-22 crawls (older crawls). We then compared the
-        statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
-    <p>The collected statistics ranged from common document-level
         metrics (e.g. number of lines, avg. line/word length, etc) to inter-document repetition metrics (MassiveText
-        inspired). Perhaps not too surprisingly given our findings for deduplication, we found significant
         disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
         metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
-        (0.0053 for 2015-22 and 0.0058 for 2013-48), to the full dedup (0.011 for 2015-22 and 0.01 for 2013-48),
         indicating that the latter had higher inter-document repetition.</p>
-    <p>To choose the metrics for filtering, we computed the Wasserstein distance between the two versions of the 2013-48 crawl for all our metrics
-        and then selected the ones with the highest distance. We would then inspect the histograms, empirically choose a threshold
-        and filter the data and inspect the removed documents. This process yielded 17 candidate
-        threshold-filter pairs. In the image below, you can see 3 of these histograms.</p>
     <div class="main-plot-container">
         <figure><img src="plots/custom_filters.png"/></figure>
         <div id="plot-stats"></div>
     </div>
-    <p>As an example, we inspected the histograms of the “fraction of lines ending with punctuation” metric (see the image above) and observed an increased document density for Full Minhash at a ratio of around 0.12.
-        We then filtered with this threshold and found out that the removed data had a higher amount of short lists or consisted of only document layout text (Home Sign up etc...).
     </p>
-    <p>To assess the effectiveness of these newly created
-        filters, we conducted <strong>28B tokens </strong>ablation runs on the <strong>2019-18 crawl</strong>. Out
         of all those runs, we identified three filters (the ones based on the histograms above) that demonstrated
         the most significant improvements on the aggregate score:</p>
     <ul>
@@ -622,12 +632,13 @@
         </li>
     </ul>
     <ul>
-        <li>When applying the 3 together, ~22% of tokens were removed</li>
     </ul>
     <div class="main-plot-container">
         <figure><img src="plots/custom_filters.png"/></figure>
         <div id="plot-custom-filters"></div>
     </div>
     <h2>The final dataset</h2>
     <p>The final 🍷 FineWeb dataset comprises 15T tokens and
         includes the following previously mentioned steps, in order, each providing a performance boost on our group
@@ -685,19 +696,20 @@
         <figure><img src="plots/dataset_ablations.png"/></figure>
         <div id="plot-dataset_ablations"></div>
     </div>
     <h2>📚 FineWeb-Edu</h2>
     <p>A new approach has recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers for identifying educational content. This technique was used in the training of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite> but its large-scale impact on web data filtering hasn't been fully explored or published.</p>
     <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
     <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
     <p>Similarly, the Llama 3 blog post<d-cite bibtex-key="meta2024responsible"></d-cite> notes:</p>
     <blockquote>We found that previous generations of Llama are good at identifying high-quality data, so we used Llama 2 to help build the text-quality classifiers that are powering Llama 3.</blockquote>
-    <p>However, these classifiers and filtered datasets are not publicly available. To enhance 🍷 FineWeb's quality, we developed an educational quality classifier using annotations generated by <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama-3-70B-Instruct</a> to create 📚 FineWeb-Edu.</p>
     <h3>Annotation</h3>
-    <p>We used Llama-3-70B-Instruct to annotate 500k samples from the 🍷 FineWeb dataset, scoring each for its educational quality on a scale from 0 to 5.</p>
     <p>We explored various prompts and found that the additive scale by Yuan et al.<d-cite bibtex-key="yuan2024self"></d-cite> worked best. This scale allows the LLM to reason about each additional point awarded, unlike the single-rating Likert scale which fits samples into predefined boxes. Then, to avoid the LLM favoring highly technical pages like arXiv abstracts and submissions, we focused on grade-school and middle-school level knowledge. By setting a threshold of 3 (on a scale of 0 to 5) during the filtering process, we were able to also retain some high-level educational pages.</p>
     <div style="text-align: center; margin: 20px 0;">
         <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/fjZQ4izIj1rx1xQnBTKKr.png" alt="Prompt for LLM annotation" style="width: 90%; max-width: 800px; height: auto;">
-        <figcaption style="font-style: italic; margin-top: 10px;">Prompt used for Llama3 annotations of the educational score, also available <a href="https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier/blob/main/utils/prompt.txt">here</a>.</figcaption>
     </div>
     <p>We also experimented with <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x7B-Instruct</a> and <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">Mixtral-8x22B-Instruct</a> and a jury of all three models<d-cite bibtex-key="verga2024replacing"></d-cite> but found that Llama3 alone gave the most reliable results.</p>
     <h3>Classifier Training</h3>
@@ -728,9 +740,8 @@
     <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
     <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>
     <h2>Next steps</h2>
-    <p>We want to continue improving FineWeb and will also
-        release a technical report with more details soon.</p>
-    <p>Adapting the FineWeb recipe [wip]</p>
 </d-article>
 <d-appendix>

     <meta name="viewport" content="width=device-width, initial-scale=1">
     <meta charset="utf8">
     <base target="_blank">
+    <title>FineWeb: decanting the web for the finest text data at scale</title>
     <style>
         /* ****************************************
     <p>🍷 FineWeb, a 15-trillion token dataset derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots, produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies.</p>
     <p>We are also excited to announce the release of <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a version of 🍷 FineWeb that was filtered for educational content, available in two sizes: <strong>1.3 trillion (very high quality) and 5.4 trillion (high quality) tokens</strong>. 📚 FineWeb-Edu outperforms all existing public web datasets, with models pretrained on it showing notable improvements on knowledge- and reasoning-intensive benchmarks like MMLU, ARC, and OpenBookQA. You can
         download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">here</a>.</p>
+    <p>Both datasets are released under the permissive <strong><a href="https://opendatacommons.org/licenses/by/1-0/">ODC-By 1.0 license</a></strong>.</p>
     <p>As 🍷 FineWeb has gathered a lot of interest from the
         community, we decided to further explain the steps involved in creating it, our processing decisions and
         a set of evaluation tasks. As we are curating a dataset for pretraining a generalist LLM, it is important to
         choose a diverse set of tasks and try not to overfit to any one individual benchmark.</p>
     <p>Another way to evaluate different datasets would be to
+        train a model on each one and have humans rate and compare their outputs (like on the <a
                 href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
         reliable results in terms of representing real model usage, but getting ablation results this way is too
         expensive and slow. It also often requires that the models have undergone at least an instruction finetuning stage, as pretrained models have difficulty following instructions.<d-cite bibtex-key="ouyang2022training"></d-cite></p>
     <ul>
         <li>small variance between runs trained on different samplings of the same
             dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
+            resulting scores to have as little evaluation noise as possible
         </li>
     </ul>
     <ul>
         <li>performance increasing monotonically (or close) over a training run:
             ideally, as the number of seen tokens increases, the performance on this benchmark should not decrease
+            (which would be indicative of unreliable results at a small scale)
         </li>
     </ul>
+    <p>We selected the following list of benchmarks:</p>
+    <ul>
+        <li>CommonSense QA<d-cite bibtex-key="talmor-etal-2019-commonsenseqa"></d-cite></li>
+        <li>HellaSwag<d-cite bibtex-key="zellers-etal-2019-hellaswag"></d-cite></li>
+        <li>OpenBook QA<d-cite bibtex-key="OpenBookQA2018"></d-cite></li>
+        <li>PIQA<d-cite bibtex-key="bisk2019piqa"></d-cite></li>
+        <li>SIQA<d-cite bibtex-key="sap2019socialiqa"></d-cite></li>
+        <li>WinoGrande<d-cite bibtex-key="sakaguchi2019winogrande"></d-cite></li>
+        <li>ARC<d-cite bibtex-key="clark2018think"></d-cite></li>
+        <li>MMLU<d-cite bibtex-key="hendrycks2021measuring"></d-cite></li>
+    </ul>
     <p>To
         have results quickly we capped longer benchmarks at 1000 samples (wall-clock evaluation taking less than 5
         min on a single node of 8 GPUs - done in parallel to the training).</p>
         extracted dumps (there are currently 96 dumps) we obtained roughly 36 trillion tokens of data (when
         tokenized with the <code>gpt2</code> tokenizer).</p>
     <h3>Deduplication</h3>
+    <p>Deduplication is one of the most important steps when creating large web datasets for LLM pretraining. Methods to deduplicate datasets attempt to identify and remove redundant/repeated data from the dataset. </p>
     <h4>Why deduplicate?</h4>
     <p>The web has many aggregators, mirrors, templated pages or
         just otherwise repeated content spread over different domains and webpages. Often, these duplicated pages
         can be introduced by the crawler itself, when different links point to the same page. </p>
     <p>Removing these duplicates (deduplicating) has been linked to an improvement in model performance<d-cite bibtex-key="lee2022deduplicating"></d-cite> and a reduction in memorization of pretraining data<d-cite bibtex-key="carlini2023quantifying"></d-cite>, which might
+        allow for better generalization. Additionally, the performance uplift obtained through deduplication can also be tied to increased training
         efficiency: by removing duplicated content, for the same number of training tokens, a model will have seen
         more diverse data.<d-cite bibtex-key="muennighoff2023scaling"></d-cite><d-cite bibtex-key="hernandez2022scaling"></d-cite></p>
     <p>There are different ways to identify and even define
         similarity metric to mark documents as duplicates, or “exact” by checking for exact matches between two
         documents (or lines, paragraphs, or whatever other granularity level being used).</p>
     <h4>Our deduplication parameters</h4>
+    <p>Similarly to RefinedWeb<d-cite bibtex-key="penedo2023refinedweb"></d-cite>, we decided to apply MinHash, a
+        fuzzy hash based deduplication technique that scales well and allows us to tune similarity thresholds (by changing the number and size of buckets) and the granularity of the matches (by changing the n-gram size). We chose to compute minhashes on each document’s 5-grams, using
         112 hash functions in total, split into 14 buckets of 8 hashes each — targeting documents that are at least
         75% similar. Documents with the same 8 minhashes in any bucket are considered a duplicate of each other.</p>
+    <p>This would mean that for two documents with a similarity ($$s$$)
         of 0.7, 0.75, 0.8 and 0.85, the probability that they would be identified as duplicates would be 56%, 77%,
         92% and 98.8% respectively ($$1-(1-s^8)^{14}$$). See the plot below for a match probability
         comparison between our setup with 112 hashes and the one from RefinedWeb, with 9000 hashes, divided into 450
     <p>It should also be noted that intra-document deduplication is already handled by our repetition filter, which removes documents with many repeated lines and paragraphs.</p>
     <h4>More deduplication is always better, right?</h4>
     <p>Our initial approach was to take the entire dataset (all
+        90+ dumps) and deduplicate them together as one big dataset using MinHash.</p>
     <p>We did this in an iterative manner: starting with the most
+        recent dump (which at the time was 2023-50) and proceeding chronologically until the oldest one, we would deduplicate each dump
+        not only within itself, but we would also remove any matches with documents from the previously processed (more recent)
         dumps. </p>
     <p>For instance, for the second most recent dump (2023-40 at
+        the time), we deduplicated it against the most recent one in addition to within itself. In particular, the oldest
+        dump was deduplicated against all other dumps. As a result, more data was removed from the oldest dumps (last
+        to be deduplicated) than from the most recent ones.</p>
     <p>Deduplicating the dataset in this manner resulted in 4
         trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
         tokens subset, the model showed no improvement over one trained on the non-deduplicated data (see orange and
+        green curves below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
     <div class="main-plot-container">
         <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
         <div id="plot-dedup_all_dumps_bad"></div>
         <li>pre deduplication, this dump had ~490 billion tokens</li>
     </ul>
     <ul>
+        <li>after our iterative MinHash, ~31 billion tokens remained (94% of data had been
             removed)
         </li>
     </ul>
             iterative dedup process (<em>originally removed data</em>)<d-footnote>While there may be documents in <em>originally kept
             data</em> similar to documents in <em>originally removed data</em>, we estimate the overlap to be small (around 4 billion tokens)</d-footnote>
         </li>
     </ul>
     <div class="main-plot-container">
         <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
         <div id="plot-removed_data_cross_dedup"></div>
     </div>
+    <p>These results show that, for this older dump from which we had
+        removed over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
         removed (considered independently of all the other dumps). This is also confirmed by visual inspection: <em>originally kept
             data</em> contains far more ads, lists of keywords and generally badly formatted text than <em>originally removed data</em>.</p>
     <h4>Taking a step back: individual dump dedup</h4>
     <p>We hypothesize that the main improvement gained from
         deduplication is the removal of very large clusters that are present in every single dump (you will find
         some examples of these clusters in the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
+        documents) and that further deduplication for clusters with a low number of duplicates (fewer than ~100, i.e. the number
+        of dumps) actually harms performance: data that does not find a duplicate match in any other dump might
         actually be worse quality/more out of distribution (as evidenced by the results on the 2013-48 data). </p>
     <p>While you might see some performance improvement when
+        deduplicating a few dumps together, at the scale of the entire dataset (all the dumps), the side effect of upsampling
+        lower quality data seems to be more impactful.</p>
     <p>One possibility to consider is that as filtering quality
         improves, this effect may not be as prevalent, since the filtering might be able to remove some of this
         lower quality data. We also experimented with applying different, and often “lighter”, deduplication
         tokens on measuring deduplication impact, we considered the following (very extreme and unrealistic
         regarding the degree of duplication observed) theoretical scenario:</p>
     <ul>
+        <li>there are 100 CommonCrawl dumps (roughly accurate)</li>
     </ul>
     <ul>
         <li>each dump has been perfectly individually deduplicated (every single
         (#duplicates=1), despite the fact that in the entire dataset each document is repeated 100 times (once per
         dump). We start seeing some changes at the 100B scale (0.5% of the total dataset), with a large number of
         documents being repeated twice, and a few even 4-8 times. At the larger scale of 1T (5% of the total
+        dataset), the majority of the documents are repeated up to 8 times, with some being repeated up to 16
         times. </p>
     <p>We ran our performance evaluations for the deduplicated
         data at the 350B scale, which would, under this theoretical scenario, be made up of a significant portion of
         documents duplicated up to 8 times. This simulation illustrates the inherent difficulties associated with
+        measuring deduplication impact on the training of LLMs, once the biggest duplicate clusters have been
         removed.</p>
     <h4>Other (failed) global approaches</h4>
     <p>We attempted to improve the performance of the
+        independently minhash deduped 20 trillion tokens of data by further deduplicating it (globally, over all dumps) with the following methods:</p>
     <ul>
         <li>URL deduplication, where we only kept one document per normalized
+            (lowercased) URL (71.5% of tokens removed, 5.6T left) — <em>FineWeb URL dedup</em></li>
     </ul>
     <ul>
         <li>Line deduplication:
             <ul>
                 <li>remove all but 1 (randomly chosen) occurrence of each duplicated line (77.8% of
+                    tokens dropped, 4.4T left) — <em>FineWeb line dedup</em></li>
             </ul>
             <ul>
                 <li>same as above, but only removing duplicate lines with at least 10
                     words and dropping documents with fewer than 3 sentences after deduplication (85% of tokens
+                    dropped, 2.9T left) — <em>FineWeb line dedup w/ min words</em></li>
             </ul>
             <ul>
                 <li>remove all but 1 occurrence of each span of 3 duplicated lines
     </div>
     <h3>Additional filtering</h3>
     <p>By this point we had reached the same performance as
+        RefinedWeb with base filtering + independent MinHash, but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
         the caveat that it is a relatively small dataset for current web-scale standards).</p>
     <p>We therefore set out to find new filtering steps that
+        would, at first, allow us to match the performance of C4 and, at a second stage, surpass it. A natural starting point
         was to look into the processing of C4 itself.</p>
     <h4>C4: A dataset that has stood the test of time</h4>
     <p>The <a href="https://huggingface.co/datasets/c4">C4
         dataset</a> was first released in 2019. It was obtained from the <code>2019-18</code> CommonCrawl dump by
         removing non-English data, applying some heuristic filters on both the line and document level,
+        deduplicating on the line level, and removing documents containing words from a word blocklist.</p>
+    <p>Despite its age and limited size for current standards (around 175B gpt2 tokens), this dataset is, to this day, a common component of typical LLM training data, being used in models such as the relatively recent Llama1<d-cite bibtex-key="touvron2023llama"></d-cite>.
+        This success is due to the strong performance that models trained on this dataset exhibit, excelling in particular on the HellaSwag
+        benchmark <d-cite bibtex-key="zellers-etal-2019-hellaswag"></d-cite>, one of the benchmarks in our “early signal” group with the highest
+        signal-to-noise ratio. We experimented with applying
+        each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump:</p>
     <div class="main-plot-container">
         <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
         <div id="plot-c4_filters_hellaswag"></div>
         </li>
     </ul>
     <ul>
+        <li>"All filters except the (very destructive) terminal_punct" performs better than
             terminal_punct by itself, while removing less in total (~7%)
         </li>
     </ul>
         the terminal punctuation one. We validated these results with a longer run, which you will find in a plot in
         the next section.</p>
     <h4>A statistical approach to develop heuristic filters</h4>
+    <p>To develop new heuristic filters and select their thresholds we devised a systematic process:</p>
+    <ol><li>we started by collecting a very large list of high level statistics (over <strong>50</strong>) ranging from common document-level
         metrics (e.g. number of lines, avg. line/word length, etc) to inter-document repetition metrics (MassiveText
+        inspired), on both a high quality and a lower quality web dataset;</li>
+        <li>we selected the metrics for which the Wasserstein distance between the two distributions (of the metric computed on each dataset) was larger;</li>
+        <li>we inspected the histograms of the two distributions and empirically chose a threshold that would make the lower quality dataset more closely resemble the higher quality one on this metric;</li>
+        <li>we validated the resulting filter (metric-threshold pair) by using it on a reference dataset and running small ablations.</li>
+    </ol>
+    <p>Due to our assumption that global MinHash greatly upsamples lower quality data in the oldest dumps, we computed metrics on both the independently
+        MinHashed and the (worse quality) global MinHashed versions of the 2013-48 and 2015-22 crawls (two older crawls). We then compared the
+        statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
+    <p>Perhaps not too surprisingly given our findings for deduplication, we found significant
         disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
         metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
+        (0.0053 for 2015-22 and 0.0058 for 2013-48), to the global dedup (0.011 for 2015-22 and 0.01 for 2013-48),
         indicating that the latter had higher inter-document repetition.</p>
+    <p>Following the process listed above for these datasets yielded 17 candidate
+        metric-threshold pairs. In the image below, you can see 3 of these histograms:</p>
     <div class="main-plot-container">
         <figure><img src="plots/custom_filters.png"/></figure>
         <div id="plot-stats"></div>
     </div>
+    <p>As an example, we inspected the histograms of "fraction of lines ending with punctuation" (see the image above) and observed an increased document density of global MinHash at around 0.12.
+        We then filtered with this threshold and found that the removed data had a higher amount of short lists or consisted of only document layout text ("Home", "Sign up", etc).
     </p>
+    <p>We then assessed the effectiveness of these 17 newly created
+        filters, by conducting <strong>28B tokens</strong> ablation runs on the <strong>2019-18 crawl</strong>. Out
         of all those runs, we identified three filters (the ones based on the histograms above) that demonstrated
         the most significant improvements on the aggregate score:</p>
     <ul>
         </li>
     </ul>
     <ul>
+        <li>When applying the 3 together, ~22% of tokens were removed.</li>
     </ul>
     <div class="main-plot-container">
         <figure><img src="plots/custom_filters.png"/></figure>
         <div id="plot-custom-filters"></div>
     </div>
+    <p>These filters allowed us to further improve performance and to, notably, surpass the C4 dataset performance.</p>
     <h2>The final dataset</h2>
     <p>The final 🍷 FineWeb dataset comprises 15T tokens and
         includes the following previously mentioned steps, in order, each providing a performance boost on our group
         <figure><img src="plots/dataset_ablations.png"/></figure>
         <div id="plot-dataset_ablations"></div>
     </div>
+    <p>Large language models pretrained on 🍷 FineWeb, the largest publicly available clean LLM pretraining dataset, perform better than those trained on other open pretraining datasets.</p>
     <h2>📚 FineWeb-Edu</h2>
     <p>A new approach has recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers for identifying educational content. This technique was used in the training of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite>, but its large-scale impact on web data filtering hasn't been fully explored or published.</p>
     <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
     <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
     <p>Similarly, the Llama 3 blog post<d-cite bibtex-key="meta2024responsible"></d-cite> notes:</p>
     <blockquote>We found that previous generations of Llama are good at identifying high-quality data, so we used Llama 2 to help build the text-quality classifiers that are powering Llama 3.</blockquote>
+    <p>However, these classifiers and filtered datasets are not publicly available. To further enhance 🍷 FineWeb's quality, we developed an educational quality classifier using annotations generated by <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama-3-70B-Instruct</a> to create <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>.</p>
     <h3>Annotation</h3>
+    <p>We used Llama-3-70B-Instruct to annotate 500k samples from 🍷 FineWeb, scoring each for its educational quality on a scale from 0 to 5.</p>
     <p>We explored various prompts and found that the additive scale by Yuan et al.<d-cite bibtex-key="yuan2024self"></d-cite> worked best. This scale allows the LLM to reason about each additional point awarded, unlike the single-rating Likert scale which fits samples into predefined boxes. Then, to avoid the LLM favoring highly technical pages like arXiv abstracts and submissions, we focused on grade-school and middle-school level knowledge. By setting a threshold of 3 (on a scale of 0 to 5) during the filtering process, we were able to also retain some high-level educational pages.</p>
     <div style="text-align: center; margin: 20px 0;">
         <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/fjZQ4izIj1rx1xQnBTKKr.png" alt="Prompt for LLM annotation" style="width: 90%; max-width: 800px; height: auto;">
+        <figcaption style="font-style: italic; margin-top: 10px;">Prompt used for Llama3 annotations of the educational score, also available <a href="https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier/blob/main/utils/prompt.txt">here</a>.</figcaption>
     </div>
     <p>We also experimented with <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x7B-Instruct</a> and <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">Mixtral-8x22B-Instruct</a> and a jury of all three models<d-cite bibtex-key="verga2024replacing"></d-cite> but found that Llama3 alone gave the most reliable results.</p>
     <h3>Classifier Training</h3>
     <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
     <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>
     <h2>Next steps</h2>
+    <p>Through our open data efforts we hope to give every model trainer the ability to create state-of-the-art large language models. As part of this process, we plan to continue iterating on FineWeb and to release more specialised filtered subsets of web data, in a fully open and reproducible manner.</p>
+    <p>While English currently dominates the large language model landscape, we believe that making high quality training data for other languages more easily accessible would allow millions of non-English speakers to benefit from these technologies and, as such, we will also strive to adapt the FineWeb recipe to a multilingual version.</p>
 </d-article>
 <d-appendix>

src/clusters.js CHANGED Viewed

@@ -33,7 +33,7 @@ const DEFAULT_XAXIS = {
     showgrid: false,
     zeroline: false,
     title: {
-        text: "<a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>Fineweb dataset</a>",
         font: {
             size: 16,
             style: "italic",

     showgrid: false,
     zeroline: false,
     title: {
+        text: "The 🍷 FineWeb dataset, <a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>clustered</a> and annotated with educational score labels",
         font: {
             size: 16,
             style: "italic",

src/plotting.js CHANGED Viewed

@@ -11,9 +11,9 @@ const BAR_SETTINGS = {
 const TASK_ID_TO_NAME = {
     // Ablations
     agg_score: "Aggregate Score",
-    "commonsense_qa/acc_norm": "Commonsense QA Norm",
     "hellaswag/acc_norm": "HellaSwag",
-    "openbookqa/acc_norm": "OpenBook QA Norm",
     "piqa/acc_norm": "PIQA",
     "siqa/acc_norm": "Social IQA",
     "winogrande/acc_norm": "WinoGrande",

 const TASK_ID_TO_NAME = {
     // Ablations
     agg_score: "Aggregate Score",
+    "commonsense_qa/acc_norm": "Commonsense QA",
     "hellaswag/acc_norm": "HellaSwag",
+    "openbookqa/acc_norm": "OpenBook QA",
     "piqa/acc_norm": "PIQA",
     "siqa/acc_norm": "Social IQA",
     "winogrande/acc_norm": "WinoGrande",