victormiller committed on
Commit
e04322e
·
verified ·
1 Parent(s): 32c6d51

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +6 -9
web.py CHANGED
@@ -431,10 +431,10 @@ def web_data():
431
  """),
432
  P("We directly read WARC files instead of WET files and extracted text using Trafilatura. Similar to RefinedWeb, we avoid using Machine Learning (ML)-based metrics for filtering documents to prevent bias introduced by ML models. Importantly, we apply global deduplication across the entire dataset, whereas previous works only use local deduplication. Note that although The Pile also employed global deduplication on its web data (Pile-CC), this accounted for just 0.6\% of 74 snapshots."),
433
 
434
- # Details(
435
- # Summary("Text Extraction Examples"),
436
- # DV2("data/sample_wet.json", "data/sample_warc.json", 3),
437
- # ),
438
  #DV2("data/sample_wet.json", "data/sample_warc.json", 3),
439
 
440
  H4("1.2 Language Identification"),
@@ -443,12 +443,9 @@ def web_data():
443
  This step removes over 60% of the whole data.
444
  """),
445
 
446
- Details(
447
- Summary("Sample documents that are classified as non-English"),
448
- DV("data/sample_non_en.json", 3),
449
- ),
450
 
451
- #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
452
 
453
 
454
  DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
 
431
  """),
432
  P("We directly read WARC files instead of WET files and extracted text using Trafilatura. Similar to RefinedWeb, we avoid using Machine Learning (ML)-based metrics for filtering documents to prevent bias introduced by ML models. Importantly, we apply global deduplication across the entire dataset, whereas previous works only use local deduplication. Note that although The Pile also employed global deduplication on its web data (Pile-CC), this accounted for just 0.6\% of 74 snapshots."),
433
 
434
+ Details(
435
+ Summary("Text Extraction Examples"),
436
+ DV2("data/sample_wet.json", "data/sample_warc.json", 3),
437
+ ),
438
  #DV2("data/sample_wet.json", "data/sample_warc.json", 3),
439
 
440
  H4("1.2 Language Identification"),
 
443
  This step removes over 60% of the whole data.
444
  """),
445
 
446
+
 
 
 
447
 
448
+ DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
449
 
450
 
451
  DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),