victormiller committed on
Commit
fb20585
·
verified ·
1 Parent(s): 5d3f993

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +42 -20
main.py CHANGED
@@ -117,13 +117,20 @@ def main():
117
  ),
118
  )
119
 
 
 
120
 
121
- @app.get("/intro")
122
- def intro():
123
- return Div(
124
- Section(
125
- H2("Introduction"),
126
- P("""We are excited to introduce TxT360, a
 
 
 
 
 
127
  large-scale, comprehensive, and fully transparent
128
  dataset designed for Large Language Model (LLM)
129
  pre-training. TxT360 is engineered to strike a
@@ -161,12 +168,9 @@ def intro():
161
  represents a significant step forward in the
162
  availability and transparency of large-scale
163
  training data for language models, setting a new
164
- standard for dataset quality and openness."""),
165
- id="section1",
166
- ),
167
- Section(
168
- H2("Background"),
169
- P(
170
  """ The quality and size of a pre-training dataset
171
  play a crucial role in the performance of large
172
  language models (LLMs). The community has
@@ -197,11 +201,8 @@ def intro():
197
  rigorous standards required for state-of-the-art
198
  LLM pre-training. """
199
  ),
200
- id="section2",
201
- ),
202
- Section(
203
- H2("Main Content"),
204
- P("""The performance of a large language model (LLM)
205
  depends heavily on the quality and size of its
206
  pretraining dataset. However, the pretraining
207
  datasets for state-of-the-art open LLMs like Llama
@@ -246,13 +247,34 @@ def intro():
246
  (listing and explaining all of our design choices),
247
  and the process followed to create its 📚
248
  FineWeb-Edu subset."""),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  id="section3",
250
  ),
251
  Section(
252
  H2("Conclusion"),
253
- P("""This is the conclusion section where we
254
- summarize the key points discussed in the blog post
255
- and provide final thoughts."""),
256
  id="section4",
257
  ),
258
  id="inner-text",
 
117
  ),
118
  )
119
 
120
+ intro_text = P(
121
+ """Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Much prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B, has demonstrated that data curation is a ‘make-or-break’ decision for model quality and capability.""")
122
 
123
+ intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
124
+
125
+ intro_list1 = Ol(
126
+ Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
127
+ Li("Employs carefully selected filters designed for each data source"),
128
+ Li("Provides only unique data elements via global deduplication across all datasets"),
129
+ Li("Retains all deduplication metadata for custom upweighting"),
130
+ Li("Is Production ready! Download here [link to HF repo]")
131
+ )
132
+
133
+ previous_intro = P("""We are excited to introduce TxT360, a
134
  large-scale, comprehensive, and fully transparent
135
  dataset designed for Large Language Model (LLM)
136
  pre-training. TxT360 is engineered to strike a
 
168
  represents a significant step forward in the
169
  availability and transparency of large-scale
170
  training data for language models, setting a new
171
+ standard for dataset quality and openness.""")
172
+
173
+ previous_background = P(
 
 
 
174
  """ The quality and size of a pre-training dataset
175
  play a crucial role in the performance of large
176
  language models (LLMs). The community has
 
201
  rigorous standards required for state-of-the-art
202
  LLM pre-training. """
203
  ),
204
+
205
+ previous_content = P("""The performance of a large language model (LLM)
 
 
 
206
  depends heavily on the quality and size of its
207
  pretraining dataset. However, the pretraining
208
  datasets for state-of-the-art open LLMs like Llama
 
247
  (listing and explaining all of our design choices),
248
  and the process followed to create its 📚
249
  FineWeb-Edu subset."""),
250
+
251
+ previous_conclusion = P("""This is the conclusion section where we
252
+ summarize the key points discussed in the blog post
253
+ and provide final thoughts."""),
254
+
255
+ @app.get("/intro")
256
+ def intro():
257
+ return Div(
258
+ Section(
259
+ H2("About TxT360"),
260
+ intro_text,
261
+ intro_list,
262
+ intro_list1,
263
+ id="section1",
264
+ ),
265
+ Section(
266
+ H2("Background"),
267
+
268
+ id="section2",
269
+ ),
270
+ Section(
271
+ H2("Main Content"),
272
+
273
  id="section3",
274
  ),
275
  Section(
276
  H2("Conclusion"),
277
+
 
 
278
  id="section4",
279
  ),
280
  id="inner-text",