victormiller committed on
Commit
e55aeba
·
verified ·
1 Parent(s): c58264f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +7 -6
main.py CHANGED
@@ -128,13 +128,14 @@ intro_text = P(
128
 
129
  intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
130
 
131
- intro_list1 = Ol(Li("Curates commonly used pretraining datasets, including all CommonCrawl"),Li("Employs carefully selected filters designed for each data source"))
 
 
 
 
 
 
132
 
133
- intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
134
- intro_2 = P("2. Employs carefully selected filters designed for each data source")
135
- intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
136
- intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
137
- intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
138
 
139
  previous_background = P(
140
  """ The quality and size of a pre-training dataset
 
128
 
129
  intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
130
 
131
+ intro_list1 = Ol(
132
+ Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
133
+ Li("Employs carefully selected filters designed for each data source"),
134
+ Li("Provides only unique data elements via globally deduplicated across all datasets"),
135
+ Li("Retains all deduplication metadata for custom upweighting"),
136
+ Li("Is Production ready! Download here [link to HF repo]")
137
+ )
138
 
 
 
 
 
 
139
 
140
  previous_background = P(
141
  """ The quality and size of a pre-training dataset