victormiller committed on
Commit
c58264f
·
verified ·
1 Parent(s): 43b3d40

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +4 -6
main.py CHANGED
@@ -127,7 +127,9 @@ intro_text = P(
127
  """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
128
 
129
  intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
130
-
 
 
131
  intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
132
  intro_2 = P("2. Employs carefully selected filters designed for each data source")
133
  intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
@@ -224,11 +226,7 @@ def intro():
224
  H2("Introduction"),
225
  intro_text,
226
  intro_list,
227
- intro_1,
228
- intro_2,
229
- intro_3,
230
- intro_4,
231
- intro_5,
232
  id="section1",
233
  ),
234
  Section(
 
127
  """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
128
 
129
  intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
130
+
131
+ intro_list1 = Ol(Li("Curates commonly used pretraining datasets, including all CommonCrawl"),Li("Employs carefully selected filters designed for each data source"))
132
+
133
  intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
134
  intro_2 = P("2. Employs carefully selected filters designed for each data source")
135
  intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
 
226
  H2("Introduction"),
227
  intro_text,
228
  intro_list,
229
+ intro_list1,
 
 
 
 
230
  id="section1",
231
  ),
232
  Section(