fh-new-vm1

Sleeping

victormiller committed on Sep 25, 2024

Commit

c58264f

verified ·

1 Parent(s): 43b3d40

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -127,7 +127,9 @@ intro_text = P(
 """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
 intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
 intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
 intro_2 = P("2. Employs carefully selected filters designed for each data source")
 intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
@@ -224,11 +226,7 @@ def intro():
             H2("Introduction"),
             intro_text,
             intro_list,
-            intro_1,
-            intro_2,
-            intro_3,
-            intro_4,
-            intro_5,
             id="section1",
         ),
         Section(

 """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
 intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
+intro_list1 = Ol(Li("Curates commonly used pretraining datasets, including all CommonCrawl"),Li("Employs carefully selected filters designed for each data source"))
 intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
 intro_2 = P("2. Employs carefully selected filters designed for each data source")
 intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
             H2("Introduction"),
             intro_text,
             intro_list,
+            intro_list1,
             id="section1",
         ),
         Section(