victormiller committed on
Commit
b9b2095
·
verified ·
1 Parent(s): 117a05e

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +20 -47
web.py CHANGED
@@ -254,46 +254,14 @@ def web_data():
254
  Li("Local Deduplication", style = "margin-bottom: 5px"),
255
  Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
256
  ),
257
- ),
258
-
259
- Div(
260
- H2("Common Crawl Data Processing Summary"),
261
- P(
262
- "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
263
- A("Common Crawl", href="https://commoncrawl.org/"),
264
- ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
265
- ),
266
- style="margin-top: 20px;",
267
- ),
268
- Div(
269
- Ul(
270
- Li(
271
- A(
272
- "Raw Documentation",
273
- href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
274
- )
275
- ),
276
- Li(
277
- A(
278
- "Github link of Web Data Pipeline",
279
- href="https://github.com/CIAI-LLM/WebDataProcessing.git",
280
- )
281
- ),
282
- ),
283
- style="""
284
- background-color: #d4edda; /* Light green background */
285
- border: 1px solid #c3e6cb; /* Green border */
286
- border-radius: 5px;
287
- padding: 15px 15px 0px 15px;
288
- margin-bottom: 15px
289
- """,
290
  ),
291
  id="section1",),
292
  Section(
293
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
294
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
295
  table_div_filter_data,
296
- P("The table below provides a comparison of the quality filters that have been applied to each dataset."),
297
  table_div_qf_filter_data,
298
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
299
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
@@ -408,7 +376,7 @@ def web_data():
408
  """),
409
 
410
  Details(
411
- Summary("24 URL domains with more than 4k matches"),
412
  Div (
413
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
414
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
@@ -425,7 +393,7 @@ def web_data():
425
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
426
  """),
427
  Details(
428
- Summary("6 url domains that are removed from the blocklist"),
429
  Div (
430
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
431
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
@@ -439,7 +407,7 @@ def web_data():
439
  ),
440
 
441
  Details(
442
- Summary("Sample documents whose urls are blocked by the refined url blocklist"),
443
  Div(
444
  DV(
445
  "data/bad_url_doc.jsonl",
@@ -460,7 +428,7 @@ def web_data():
460
  """),
461
 
462
  Details(
463
- Summary("curated url domains that are excluded from our dataset"),
464
  Div (
465
  DVS(
466
  non_web_urls,
@@ -477,7 +445,7 @@ def web_data():
477
  ),
478
 
479
  Details(
480
- Summary("Sample documents whose urls are in our curated url domain list"),
481
  Div (
482
  DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
483
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
@@ -539,7 +507,7 @@ def web_data():
539
  The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
540
  """),
541
  Details(
542
- Summary("Javascript Examples Filtered by C4 but Kept in TxT360"),
543
  Div (
544
  DV(
545
  "data/sample_java.jsonl",
@@ -589,7 +557,7 @@ def web_data():
589
  the bad words from English but also consider the bad words from other languages.
590
  """),
591
  Details(
592
- Summary("Sample documents with toxic lines"),
593
  Div (
594
  DVS(
595
  json.load(open("data/toxic_lines.json")),
@@ -611,7 +579,7 @@ def web_data():
611
  In this section, we introduce each quality signal used to filter out low-quality documents.
612
  """),
613
  Details(
614
- Summary("Overview of all the quality signals that are used for filtering"),
615
  Div (
616
  DVS(
617
  json.load(open("data/all_signals.json")),
@@ -732,7 +700,6 @@ def web_data():
732
  We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
733
  ensures consistency with the overall document character count calculation.
734
  """),
735
- H3("TxT360 Implementation"),
736
  Details(
737
  Summary("TxT360 Implementation"),
738
  Div(
@@ -1153,9 +1120,6 @@ def web_data():
1153
  margin-bottom: 15px
1154
  """,
1155
  ),
1156
- H5(
1157
- "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
1158
- ),
1159
  Details(
1160
  Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
1161
  Div(
@@ -1300,13 +1264,22 @@ def web_data():
1300
  Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
1301
  Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
1302
  ),
1303
- H3("Word Count"),
1304
  Details(
 
1305
  Summary("Implementations from Dolma"),
1306
  D_code("""
1307
  words = text.split()
1308
  word_count = len(words)
1309
  """, block="block", language="python"),
 
 
 
 
 
 
 
 
1310
  ),
1311
  Details(
1312
  Summary("Implementations from RedPajama-V2"),
 
254
  Li("Local Deduplication", style = "margin-bottom: 5px"),
255
  Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
256
  ),
257
+ P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of the datasets we reviewed and a comparison of the filters we have applied."),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  ),
259
  id="section1",),
260
  Section(
261
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
262
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
263
  table_div_filter_data,
264
+ P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering process that should be considered for any filtering project. However, we are leaving that option to TxT360's end users."),
265
  table_div_qf_filter_data,
266
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
267
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
 
376
  """),
377
 
378
  Details(
379
+ Summary(" List of 24 URLs with 4k+ Matches"),
380
  Div (
381
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
382
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
 
393
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
394
  """),
395
  Details(
396
+ Summary("6 URLs Manually Removed from the Blocklist"),
397
  Div (
398
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
399
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
 
407
  ),
408
 
409
  Details(
410
+ Summary("Blocked Document Examples from the URL Blocklist"),
411
  Div(
412
  DV(
413
  "data/bad_url_doc.jsonl",
 
428
  """),
429
 
430
  Details(
431
+ Summary("TxT360 Excluded URLs"),
432
  Div (
433
  DVS(
434
  non_web_urls,
 
445
  ),
446
 
447
  Details(
448
+ Summary("TxT360 Excluded URLs Example Documents"),
449
  Div (
450
  DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
451
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
 
507
  The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
508
  """),
509
  Details(
510
+ Summary("Javascript Documents Filtered by C4 but Kept in TxT360"),
511
  Div (
512
  DV(
513
  "data/sample_java.jsonl",
 
557
  the bad words from English but also consider the bad words from other languages.
558
  """),
559
  Details(
560
+ Summary("Toxic Line Examples (WARNING: MAY CONTAIN OFFENSIVE MATERIAL)"),
561
  Div (
562
  DVS(
563
  json.load(open("data/toxic_lines.json")),
 
579
  In this section, we introduce each quality signal used to filter out low-quality documents.
580
  """),
581
  Details(
582
+ Summary("Quality Signals Used For Filtering"),
583
  Div (
584
  DVS(
585
  json.load(open("data/all_signals.json")),
 
700
  We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
701
  ensures consistency with the overall document character count calculation.
702
  """),
 
703
  Details(
704
  Summary("TxT360 Implementation"),
705
  Div(
 
1120
  margin-bottom: 15px
1121
  """,
1122
  ),
 
 
 
1123
  Details(
1124
  Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
1125
  Div(
 
1264
  Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
1265
  Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
1266
  ),
1267
+ H3("Word Count Filters"),
1268
  Details(
1269
+ Div(
1270
  Summary("Implementations from Dolma"),
1271
  D_code("""
1272
  words = text.split()
1273
  word_count = len(words)
1274
  """, block="block", language="python"),
1275
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1276
+ ),
1277
+ style="""
1278
+ background-color: #EAFFF1; /* Light green background */
1279
+ padding: 15px;
1280
+ border-radius: 12px;
1281
+ margin-bottom: 15px
1282
+ """,
1283
  ),
1284
  Details(
1285
  Summary("Implementations from RedPajama-V2"),