Spaces:
Sleeping
Sleeping
victormiller
committed on
Update main.py
Browse files
main.py
CHANGED
@@ -127,7 +127,9 @@ intro_text = P(
|
|
127 |
"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
|
128 |
|
129 |
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
|
130 |
-
|
|
|
|
|
131 |
intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
|
132 |
intro_2 = P("2. Employs carefully selected filters designed for each data source")
|
133 |
intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
|
@@ -224,11 +226,7 @@ def intro():
|
|
224 |
H2("Introduction"),
|
225 |
intro_text,
|
226 |
intro_list,
|
227 |
-
|
228 |
-
intro_2,
|
229 |
-
intro_3,
|
230 |
-
intro_4,
|
231 |
-
intro_5,
|
232 |
id="section1",
|
233 |
),
|
234 |
Section(
|
|
|
127 |
"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
|
128 |
|
129 |
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
|
130 |
+
|
131 |
+
intro_list1 = Ol(Li("Curates commonly used pretraining datasets, including all CommonCrawl"),Li("Employs carefully selected filters designed for each data source"))
|
132 |
+
|
133 |
intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
|
134 |
intro_2 = P("2. Employs carefully selected filters designed for each data source")
|
135 |
intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
|
|
|
226 |
H2("Introduction"),
|
227 |
intro_text,
|
228 |
intro_list,
|
229 |
+
intro_list1,
|
|
|
|
|
|
|
|
|
230 |
id="section1",
|
231 |
),
|
232 |
Section(
|