"""FastHTML application serving the TxT360 blog-style documentation site.

The module builds the app with the distill.pub template, htmx and Plotly
scripts in the page headers, defines the landing page (``/``) and the
introduction fragment (``/intro``), and registers the section routes
implemented in the sibling ``curated``, ``web``, ``common`` and ``results``
modules.
"""

# NOTE: import order is kept exactly as in the original file because the star
# imports and ``from rich import print`` rebind names; reordering could change
# which definition wins.
from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results

app, rt = fast_app(
    debug=True,
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)


def _intro_toc_item(label, anchor):
    """Return a table-of-contents ``Li`` linking to ``/intro#<anchor>``."""
    target = f"/intro#{anchor}"
    return Li(
        A(
            label,
            href=target,
            hx_get=target,
            hx_target="#inner-text",
        )
    )


def _page_toc_entry(label, route):
    """Return a table-of-contents ``Div`` that loads ``route`` into ``#inner-text``."""
    return Div(
        A(label, href="#inner-text"),
        hx_get=route,
        hx_target="#inner-text",
    )


@app.get("/")
def main():
    """Build the landing page: title banner, table of contents and intro text.

    Returns:
        A ``Div`` holding the distill front matter, the title block with the
        logo image, and an article made of the navigation panel followed by
        the component returned by ``intro()``.
    """
    # NOTE(review): these labels do not match the H2 headings that intro()
    # renders for section1..section4 - confirm which side is authoritative.
    intro_sections = (
        ("About TxT360", "section1"),
        ("Global Deduplication", "section2"),
        ("Controllable Upweighting", "section3"),
        ("Full Documentation", "section4"),
    )

    title_block = D_title(
        H1(
            "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    table_of_contents = Nav(
        H3("Table of Contents"),
        Div(
            A("TxT360", href="#_self"),
            hx_get="/intro",
            hx_target="#inner-text",
        ),
        Div(
            Ul(*[_intro_toc_item(label, anchor) for label, anchor in intro_sections]),
        ),
        _page_toc_entry("Web Data", "/webdata"),
        _page_toc_entry("Curated Sources", "/curated"),
        _page_toc_entry("Common Steps", "/common"),
        _page_toc_entry("TxT360 Results", "/results"),
        role="navigation",
        cls="l-text figcaption",
    )

    return Div(
        D_front_matter(),
        title_block,
        D_article(
            D_contents(table_of_contents),
            intro(),
        ),
    )


@app.get("/intro")
def intro():
    """Build the introduction content shown inside ``#inner-text``.

    Returns:
        A ``Div`` with ``id="inner-text"`` containing four ``Section``
        elements (ids ``section1`` .. ``section4``), each made of an ``H2``
        heading and one ``P`` paragraph.
    """
    # The paragraphs are assembled from adjacent string literals so the source
    # stays readable; the explicit "\n" marks a line break that was present
    # inside the original literal.
    introduction_text = (
        "We are excited to introduce TxT360, a large-scale, comprehensive, and fully "
        "transparent dataset designed for Large Language Model (LLM) pre-training. "
        "TxT360 is engineered to strike a balance between the quantity and quality of "
        "pre-training data, pushing the limit on both fronts. "
        "This comprehensive dataset encompasses both expansive web-based data and "
        "highly curated data sources, making it one of the most robust LLM "
        "pre-training corpora available today. "
        "Our web data component includes 99 snapshots from Common Crawl, amassing "
        "5.7 trillion tokens and occupying 11 TB of disk space in jsonl.gz format. "
        "On the curated side, TxT360 integrates one of the most extensive collections "
        "of high-quality sources across multiple domains, ensuring diverse and rich "
        "content referred to as curated sources, 14 sources across 10 domains. "
        "To maintain the highest quality, we meticulously pre-processed the web data "
        "to filter out low-quality content and conducted thorough reviews of the "
        "curated sources. "
        "This process not only unified their formats but also identified and "
        "rectified any anomalies. "
        "Not only do we 100% open-source our processing scripts, but we also release "
        "the details of our data reviews, revealing the decision-making processes "
        "behind data selection and quality assurance. "
        "This level of transparency allows researchers and practitioners to fully "
        "understand the dataset’s composition and make informed decisions when using "
        "TxT360 for training. "
        "Additionally, TxT360 includes detailed documentation and analysis of the "
        "data, covering distribution statistics, domain coverage, and processing "
        "pipeline, which helps users navigate and utilize the dataset effectively. \n"
        "Overall, TxT360 represents a significant step forward in the availability "
        "and transparency of large-scale training data for language models, setting "
        "a new standard for dataset quality and openness."
    )

    # Leading and trailing spaces are part of the original literal and are kept.
    background_text = (
        " The quality and size of a pre-training dataset play a crucial role in the "
        "performance of large language models (LLMs). "
        "The community has introduced a variety of datasets for this purpose, "
        "including purely web-based datasets like RefinedWeb [1], "
        "RedPajama-Data-V2 [2], DCLM [3], and FineWeb [4], as well as comprehensive "
        "datasets derived from multiple highly-curated data sources such as "
        "The Pile [5], RedPajama-Data-V1 [6], and Dolma [7] . "
        "It is commonly known that web-based datasets provide a vast quantity of "
        "data, while highly-curated multi-source datasets consistently deliver high "
        "quality and diversity, both critical for effective LLM pre-training. "
        "However, despite the advancements in both types of data, each type of "
        "dataset has its limitations. "
        "For instance, the processing scripts for the web dataset, RefinedWeb, known "
        "for its high quality, are not public, and only about 10% of the entire "
        "dataset has been disclosed. "
        "Conversely, the web component of existing highly-curated multi-source "
        "datasets is relatively small compared to purely web-based datasets, "
        "limiting their coverage and diversity compared to the scale of information "
        "from the internet. "
        "By integrating the extensive reach of web data with the exceptional quality "
        "of curated sources, TxT360 is crafted to meet and surpass the rigorous "
        "standards required for state-of-the-art LLM pre-training. "
    )

    # NOTE(review): this paragraph talks about FineWeb rather than TxT360 and
    # looks like placeholder copy - confirm before publishing.
    # The book emoji was previously stored as mis-decoded UTF-8 bytes and
    # rendered as garbage; it is written as the real character here.
    main_content_text = (
        "The performance of a large language model (LLM) depends heavily on the "
        "quality and size of its pretraining dataset. "
        "However, the pretraining datasets for state-of-the-art open LLMs like "
        "Llama 3 and Mixtral are not publicly available and very little is known "
        "about how they were created. "
        "Reading time: 45 min. \n"
        "For the best reading experience, we recommend not using a mobile phone. "
        "Recently, we released 🍷 FineWeb, a new, large-scale (15-trillion tokens, "
        "44TB disk space) dataset for LLM pretraining. "
        "FineWeb is derived from 96 CommonCrawl snapshots and produces "
        "better-performing LLMs than other open pretraining datasets. "
        "To bring more clarity in machine learning and advance the open "
        "understanding of how to train good quality large language models, we "
        "carefully documented and ablated all of the design choices used in FineWeb, "
        "including in-depth investigations of deduplication and filtering "
        "strategies. "
        "The present long form report is a deep dive in how to create a large and "
        "high-quality web-scale dataset for LLM pretraining. "
        "The dataset itself, 🍷 FineWeb, is available here. "
        "We are extremely thankful to the whole distill.pub team (Christopher Olah, "
        "Shan Carter, Ludwig Schubert in particular) for creating the template on "
        "which we based this blog post. "
        "Thanks also for inspiring us with exquisitely crafted articles and blog "
        "posts. "
        "In this report we also introduce 📚 FineWeb-Edu, a subset of FineWeb "
        "constructed using scalable automated high-quality annotations for "
        "educational value, and which outperforms all openly accessible "
        "web-datasets on a number of educational benchmarks such as MMLU, ARC, and "
        "OpenBookQA. "
        "📚 FineWeb-Edu is available in two sizes/filtering-level: 1.3 trillion "
        "(very high educational content) and 5.4 trillion (high educational "
        "content) tokens (all tokens are measured with GPT2 tokenizer). "
        "You can download it here. \n"
        "Both datasets are released under the permissive ODC-By 1.0 license "
        "TLDR: This blog covers a discussion on processing and evaluating data "
        "quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our "
        "design choices), and the process followed to create its 📚 FineWeb-Edu "
        "subset."
    )

    conclusion_text = (
        "This is the conclusion section where we summarize the key points "
        "discussed in the blog post and provide final thoughts."
    )

    return Div(
        Section(
            H2("Introduction"),
            P(introduction_text),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(background_text),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            P(main_content_text),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P(conclusion_text),
            id="section4",
        ),
        id="inner-text",
    )


# Register the section handlers implemented in the sibling modules.
rt("/curated")(curated.curated)
rt("/curated/{target}")(curated.update)
rt("/webdata")(web.web_data)
rt("/webdata/{target}")(web.update)
rt("/common")(common.common_steps)
rt("/results")(results.results)

# Launch the application through FastHTML's serve() helper.
serve()