Spaces:
Sleeping
Sleeping
victormiller
committed on
Update main.py
Browse files
main.py
CHANGED
@@ -128,13 +128,14 @@ intro_text = P(
|
|
128 |
|
129 |
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
|
130 |
|
131 |
-
intro_list1 = Ol(
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
-
intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
|
134 |
-
intro_2 = P("2. Employs carefully selected filters designed for each data source")
|
135 |
-
intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
|
136 |
-
intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
|
137 |
-
intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
|
138 |
|
139 |
previous_background = P(
|
140 |
""" The quality and size of a pre-training dataset
|
|
|
128 |
|
129 |
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
|
130 |
|
131 |
+
intro_list1 = Ol(
|
132 |
+
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|
133 |
+
Li("Employs carefully selected filters designed for each data source"),
|
134 |
+
Li("Provides only unique data elements via globally deduplicated across all datasets"),
|
135 |
+
Li("Retains all deduplication metadata for custom upweighting"),
|
136 |
+
Li("Is Production ready! Download here [link to HF repo]")
|
137 |
+
)
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
previous_background = P(
|
141 |
""" The quality and size of a pre-training dataset
|