victormiller committed on
Commit
2f958f8
·
verified ·
1 Parent(s): 48d8ec3

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +13 -6
web.py CHANGED
@@ -242,6 +242,7 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
242
 
243
  def web_data():
244
  return Div(
 
245
  Div(
246
  H2("Common Crawl Snapshot Processing"),
247
  H3("What This Section Contains"),
@@ -287,6 +288,8 @@ def web_data():
287
  margin-bottom: 15px
288
  """,
289
  ),
 
 
290
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
291
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
292
  table_div_filter_data,
@@ -325,8 +328,9 @@ def web_data():
325
 
326
  # P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
327
 
328
-
329
- H2("Stage 1: Document Preparation"),
 
330
 
331
 
332
  P(B("Text Extraction: "), """
@@ -486,8 +490,9 @@ def web_data():
486
  """,
487
  ),
488
 
489
-
490
- H2("2. Line-Level Removal"),
 
491
  P("""
492
  Before filtering low-quality documents, we perform the line-level removal to remove low-quality lines.
493
  This ensured that computing quality signals would align with the final kept texts.
@@ -599,8 +604,9 @@ def web_data():
599
  margin-bottom: 15px
600
  """,
601
  ),
602
-
603
- H2("3. Document-Level Filtering"),
 
604
  P("""
605
  In this section, we introduce each quality signal used to filter out low-quality documents.
606
  """),
@@ -1660,4 +1666,5 @@ def web_data():
1660
  margin-bottom: 15px
1661
  """,
1662
  ),
 
1663
  )
 
242
 
243
  def web_data():
244
  return Div(
245
+ Section(
246
  Div(
247
  H2("Common Crawl Snapshot Processing"),
248
  H3("What This Section Contains"),
 
288
  margin-bottom: 15px
289
  """,
290
  ),
291
+ id="section1",),
292
+ Section(
293
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
294
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
295
  table_div_filter_data,
 
328
 
329
  # P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
330
 
331
+ id="section2",),
332
+ Section(
333
+ H2("Document Preparation"),
334
 
335
 
336
  P(B("Text Extraction: "), """
 
490
  """,
491
  ),
492
 
493
+ id="section3",),
494
+ Section(
495
+ H2("Line-Level Removal"),
496
  P("""
497
  Before filtering low-quality documents, we perform the line-level removal to remove low-quality lines.
498
  This ensured that computing quality signals would align with the final kept texts.
 
604
  margin-bottom: 15px
605
  """,
606
  ),
607
+ id="section4",),
608
+ Section(
609
+ H2("Document-Level Filtering"),
610
  P("""
611
  In this section, we introduce each quality signal used to filter out low-quality documents.
612
  """),
 
1666
  margin-bottom: 15px
1667
  """,
1668
  ),
1669
+ id="section5",),
1670
  )