CarisMu committed on
Commit
02f8831
·
verified ·
1 Parent(s): 59cb00c

change the backgrounds of collapsible sample text

Browse files
Files changed (1) hide show
  1. web.py +219 -3
web.py CHANGED
@@ -297,8 +297,9 @@ def web_data():
297
  Summary("Text Extraction Examples"),
298
  DV2("data/sample_wet.json", "data/sample_warc.json", 3),
299
  style="""
300
- background-color: #D3D3D3; /* Light grey background */
301
- border: 1px solid #949494; /* Grey border */
 
302
  border-radius: 12px;
303
  """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
304
  ),
@@ -314,6 +315,11 @@ def web_data():
314
  Details(
315
  Summary("Non-English Documents"),
316
  DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
 
 
 
 
 
317
  ),
318
 
319
  #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
@@ -321,6 +327,11 @@ def web_data():
321
  Details(
322
  Summary("English Documents Scoring Lower than 0.65"),
323
  DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
 
 
 
 
 
324
  ),
325
 
326
  H3("1.3 URL Filtering"),
@@ -338,6 +349,11 @@ def web_data():
338
  Details(
339
  Summary("24 URL domains with more than 4k matches"),
340
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
 
 
 
 
 
341
  ),
342
 
343
  P("""
@@ -346,6 +362,11 @@ def web_data():
346
  Details(
347
  Summary("6 url domains that are removed from the blocklist"),
348
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
 
 
 
 
 
349
  ),
350
 
351
  Details(
@@ -354,7 +375,12 @@ def web_data():
354
  "data/bad_url_doc.jsonl",
355
  3,
356
  "Sample documents whose urls are blocked by the refined url blocklist",
357
- ),
 
 
 
 
 
358
  ),
359
 
360
  H3("1.3.2 Excluded High Quality Sources"),
@@ -368,11 +394,21 @@ def web_data():
368
  non_web_urls,
369
  "curated url domains that are excluded from our dataset",
370
  ),
 
 
 
 
 
371
  ),
372
 
373
  Details(
374
  Summary("Sample documents whose urls are in our curated url domain list"),
375
  DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
 
 
 
 
 
376
  ),
377
 
378
 
@@ -401,6 +437,11 @@ def web_data():
401
  0,
402
  "Sample documents with lines that are removed by the rule of terminal punctuation",
403
  ),
 
 
 
 
 
404
  ),
405
 
406
 
@@ -422,6 +463,11 @@ def web_data():
422
  0,
423
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
424
  ),
 
 
 
 
 
425
  ),
426
  H3("2.2 Other Rules from RefinedWeb"),
427
  P("""
@@ -440,6 +486,11 @@ def web_data():
440
  0,
441
  "Sample documents with lines that are removed by the RefinedWeb rules",
442
  ),
 
 
 
 
 
443
  ),
444
  H3("2.3 Toxic Lines"),
445
  P("""
@@ -455,6 +506,11 @@ def web_data():
455
  json.load(open("data/toxic_lines.json")),
456
  "Sample documents with toxic lines",
457
  ),
 
 
 
 
 
458
  ),
459
 
460
  H2("3. Document-Level Filtering"),
@@ -467,6 +523,11 @@ def web_data():
467
  json.load(open("data/all_signals.json")),
468
  "Overview of all the quality signals that are used for filtering",
469
  ),
 
 
 
 
 
470
  ),
471
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
472
  Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
@@ -505,6 +566,11 @@ def web_data():
505
  len(line) * count for line, count in line_counts.items() if count > 1
506
  ) / max(character_count, 1)
507
  """, block="block", language="python"),
 
 
 
 
 
508
  ),
509
  Details(
510
  Summary("Implementations from DataTrove"),
@@ -539,6 +605,11 @@ def web_data():
539
  if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
540
  return False, "dup_line_char_frac"
541
  """, block="block", language="python"),
 
 
 
 
 
542
  ),
543
  P("""
544
  After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
@@ -580,6 +651,11 @@ def web_data():
580
  sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
581
  line_counts.items() if count > 1) / character_count
582
  """, block="block", language="python"),
 
 
 
 
 
583
  ),
584
  Details(
585
  Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
@@ -588,6 +664,11 @@ def web_data():
588
  0,
589
  "Sample documents filtered by excessive line repetitions / characters in repeated lines",
590
  ),
 
 
 
 
 
591
  ),
592
  H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
593
  P("""
@@ -611,6 +692,11 @@ def web_data():
611
  value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
612
  attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
613
  """, block="block", language="python"),
 
 
 
 
 
614
  ),
615
  Details(
616
  Summary("Implementations from RedPajama-V2"),
@@ -649,6 +735,11 @@ def web_data():
649
  score = round(score, PRECISION)
650
  return [(0, len(document), score)]
651
  """, block="block", language="python"),
 
 
 
 
 
652
  ),
653
 
654
  Details(
@@ -672,6 +763,11 @@ def web_data():
672
  if top_char_length / len(text) > n_frac:
673
  return False, f"top_n_gram"
674
  """, block="block", language="python"),
 
 
 
 
 
675
  ),
676
  P("""
677
  There are almost no contradictions between each implementations of fractions of characters in the most common
@@ -699,6 +795,11 @@ def web_data():
699
  value = count * sum(len(w) for w in most_common_ngram) / character_count
700
  attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
701
  """, block="block", language="python"),
 
 
 
 
 
702
  ),
703
  Details(
704
  Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
@@ -707,6 +808,11 @@ def web_data():
707
  0,
708
  "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
709
  ),
 
 
 
 
 
710
  ),
711
  H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
712
  P("""
@@ -733,6 +839,11 @@ def web_data():
733
  ) / max(ng_char_count, 1)
734
  attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
735
  """, block="block", language="python"),
 
 
 
 
 
736
  ),
737
  Details(
738
  Summary("Implementations from RedPajama-V2"),
@@ -786,6 +897,11 @@ def web_data():
786
  score = round(score, PRECISION)
787
  return [(0, len(document), score)]
788
  """, block="block", language="python"),
 
 
 
 
 
789
  ),
790
 
791
  Details(
@@ -811,6 +927,11 @@ def web_data():
811
  if n_duplicates_char / len(text) > n_frac:
812
  return False, f"duplicated_n_grams"
813
  """, block="block", language="python"),
 
 
 
 
 
814
  ),
815
  P("""
816
  For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
@@ -864,6 +985,11 @@ def web_data():
864
  score = get_dup_ngram_frac(n, ngram_counts, text)
865
  attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
866
  """, block="block", language="python"),
 
 
 
 
 
867
  ),
868
  Details(
869
  Summary("An example to show the difference between above implementations"),
@@ -878,6 +1004,11 @@ def web_data():
878
 
879
  In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurence. This results in a fraction of 10/17.
880
  """),
 
 
 
 
 
881
  ),
882
  H5(
883
  "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
@@ -889,6 +1020,11 @@ def web_data():
889
  0,
890
  "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
891
  ),
 
 
 
 
 
892
  ),
893
  H3("3.2 Line-wise Heuristics"),
894
  P("""
@@ -915,6 +1051,11 @@ def web_data():
915
  D_code("""
916
  ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
917
  """, block="block", language="python"),
 
 
 
 
 
918
  ),
919
  Details(
920
  Summary("Bullet Point Identification Implemetations"),
@@ -959,6 +1100,11 @@ def web_data():
959
  "*", # * star
960
  )
961
  """, block="block", language="python"),
 
 
 
 
 
962
  ),
963
 
964
 
@@ -969,6 +1115,11 @@ def web_data():
969
  0,
970
  "Sample documents that are filtered out by line-wise heuristics",
971
  ),
 
 
 
 
 
972
  ),
973
 
974
  H3("3.3 Statistics-based Heuristics"),
@@ -1029,6 +1180,11 @@ def web_data():
1029
  text = unicodedata.normalize("NFD", text)
1030
  return text
1031
  """, block="block", language="python"),
 
 
 
 
 
1032
  ),
1033
 
1034
  Details(
@@ -1040,6 +1196,11 @@ def web_data():
1040
  non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
1041
  n_non_symbol_words_words = len(non_symbol_words)
1042
  """, block="block", language="python"),
 
 
 
 
 
1043
  ),
1044
  P("""
1045
  Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
@@ -1084,6 +1245,11 @@ def web_data():
1084
  score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
1085
  return [(0, len(document), score)]
1086
  """, block="block", language="python"),
 
 
 
 
 
1087
  ),
1088
  P("""
1089
  However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
@@ -1100,6 +1266,11 @@ def web_data():
1100
  ...
1101
  attrs.num_of_sentences = count_sentences(text)
1102
  """, block="block", language="python"),
 
 
 
 
 
1103
  ),
1104
 
1105
  H3("Symbol to Word Ratio"),
@@ -1116,6 +1287,11 @@ def web_data():
1116
  word_count, 1
1117
  )
1118
  """, block="block", language="python"),
 
 
 
 
 
1119
  ),
1120
  Details(
1121
  Summary("Implementations from RedPajama-V2"),
@@ -1142,6 +1318,11 @@ def web_data():
1142
  score = round(score, PRECISION)
1143
  return [(0, len(document), score)]
1144
  """, block="block", language="python"),
 
 
 
 
 
1145
  ),
1146
 
1147
  Details(
@@ -1152,6 +1333,11 @@ def web_data():
1152
  if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
1153
  return False, "gopher_too_many_ellipsis"
1154
  """, block="block", language="python"),
 
 
 
 
 
1155
  ),
1156
  Details(
1157
  Summary("TxT360 Implementation"),
@@ -1162,6 +1348,11 @@ def web_data():
1162
  ...
1163
  attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
1164
  """, block="block", language="python"),
 
 
 
 
 
1165
  ),
1166
 
1167
  H3("Fraction of Alphabetic Words"),
@@ -1172,6 +1363,11 @@ def web_data():
1172
  1 for word in words if any(c.isalpha() for c in word)
1173
  ) / max(word_count, 1)
1174
  """, block="block", language="python"),
 
 
 
 
 
1175
  ),
1176
  Details(
1177
  Summary("Implementations from RedPajama-V2"),
@@ -1196,6 +1392,11 @@ def web_data():
1196
  score = round(score, PRECISION)
1197
  return [(0, len(document), score)]
1198
  """, block="block", language="python"),
 
 
 
 
 
1199
  ),
1200
  Details(
1201
  Summary("Implementations from DataTrove"),
@@ -1207,6 +1408,11 @@ def web_data():
1207
  ):
1208
  return False, "gopher_below_alpha_threshold"
1209
  """, block="block", language="python"),
 
 
 
 
 
1210
  ),
1211
  P("""
1212
  Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
@@ -1233,6 +1439,11 @@ def web_data():
1233
  0,
1234
  "Sample documents that are filtered out by statistics-based heuristics",
1235
  ),
 
 
 
 
 
1236
  ),
1237
  H3("3.4 Others"),
1238
  P("""
@@ -1243,6 +1454,11 @@ def web_data():
1243
  Details(
1244
  Summary("Sample documents containing 'lorem ipsum'"),
1245
  DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
 
 
 
 
 
1246
  ),
1247
  H2("4. Deduplication"),
1248
  P("""
 
297
  Summary("Text Extraction Examples"),
298
  DV2("data/sample_wet.json", "data/sample_warc.json", 3),
299
  style="""
300
+ background-color: #F0F8FF; /* Light blue background */
301
+ padding: 15px;
302
+ # border: 1px solid #949494; /* Grey border */
303
  border-radius: 12px;
304
  """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
305
  ),
 
315
  Details(
316
  Summary("Non-English Documents"),
317
  DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
318
+ style="""
319
+ background-color: #FFC0CB; /* Light pink background */
320
+ padding: 15px;
321
+ border-radius: 12px;
322
+ """,
323
  ),
324
 
325
  #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
 
327
  Details(
328
  Summary("English Documents Scoring Lower than 0.65"),
329
  DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
330
+ style="""
331
+ background-color: #EAFFF1; /* Light green background */
332
+ padding: 15px;
333
+ border-radius: 12px;
334
+ """,
335
  ),
336
 
337
  H3("1.3 URL Filtering"),
 
349
  Details(
350
  Summary("24 URL domains with more than 4k matches"),
351
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
352
+ style="""
353
+ background-color: #FFC0CB; /* Light pink background */
354
+ padding: 15px;
355
+ border-radius: 12px;
356
+ """,
357
  ),
358
 
359
  P("""
 
362
  Details(
363
  Summary("6 url domains that are removed from the blocklist"),
364
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
365
+ style="""
366
+ background-color: #FFC0CB; /* Light pink background */
367
+ padding: 15px;
368
+ border-radius: 12px;
369
+ """,
370
  ),
371
 
372
  Details(
 
375
  "data/bad_url_doc.jsonl",
376
  3,
377
  "Sample documents whose urls are blocked by the refined url blocklist",
378
+ ),
379
+ style="""
380
+ background-color: #FFC0CB; /* Light pink background */
381
+ padding: 15px;
382
+ border-radius: 12px;
383
+ """,
384
  ),
385
 
386
  H3("1.3.2 Excluded High Quality Sources"),
 
394
  non_web_urls,
395
  "curated url domains that are excluded from our dataset",
396
  ),
397
+ style="""
398
+ background-color: #FFC0CB; /* Light pink background */
399
+ padding: 15px;
400
+ border-radius: 12px;
401
+ """,
402
  ),
403
 
404
  Details(
405
  Summary("Sample documents whose urls are in our curated url domain list"),
406
  DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
407
+ style="""
408
+ background-color: #EAFFF1; /* Light green background */
409
+ padding: 15px;
410
+ border-radius: 12px;
411
+ """,
412
  ),
413
 
414
 
 
437
  0,
438
  "Sample documents with lines that are removed by the rule of terminal punctuation",
439
  ),
440
+ style="""
441
+ background-color: #FFC0CB; /* Light pink background */
442
+ padding: 15px;
443
+ border-radius: 12px;
444
+ """,
445
  ),
446
 
447
 
 
463
  0,
464
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
465
  ),
466
+ style="""
467
+ background-color: #FFC0CB; /* Light pink background */
468
+ padding: 15px;
469
+ border-radius: 12px;
470
+ """,
471
  ),
472
  H3("2.2 Other Rules from RefinedWeb"),
473
  P("""
 
486
  0,
487
  "Sample documents with lines that are removed by the RefinedWeb rules",
488
  ),
489
+ style="""
490
+ background-color: #FFC0CB; /* Light pink background */
491
+ padding: 15px;
492
+ border-radius: 12px;
493
+ """,
494
  ),
495
  H3("2.3 Toxic Lines"),
496
  P("""
 
506
  json.load(open("data/toxic_lines.json")),
507
  "Sample documents with toxic lines",
508
  ),
509
+ style="""
510
+ background-color: #FFC0CB; /* Light pink background */
511
+ padding: 15px;
512
+ border-radius: 12px;
513
+ """,
514
  ),
515
 
516
  H2("3. Document-Level Filtering"),
 
523
  json.load(open("data/all_signals.json")),
524
  "Overview of all the quality signals that are used for filtering",
525
  ),
526
+ style="""
527
+ background-color: #EAFFF1; /* Light green background */
528
+ padding: 15px;
529
+ border-radius: 12px;
530
+ """,
531
  ),
532
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
533
  Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
 
566
  len(line) * count for line, count in line_counts.items() if count > 1
567
  ) / max(character_count, 1)
568
  """, block="block", language="python"),
569
+ style="""
570
+ background-color: #FFFAEA; /* Light yellow background */
571
+ padding: 15px;
572
+ border-radius: 12px;
573
+ """,
574
  ),
575
  Details(
576
  Summary("Implementations from DataTrove"),
 
605
  if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
606
  return False, "dup_line_char_frac"
607
  """, block="block", language="python"),
608
+ style="""
609
+ background-color: #FFFAEA; /* Light yellow background */
610
+ padding: 15px;
611
+ border-radius: 12px;
612
+ """,
613
  ),
614
  P("""
615
  After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
 
651
  sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
652
  line_counts.items() if count > 1) / character_count
653
  """, block="block", language="python"),
654
+ style="""
655
+ background-color: #EAFFF1; /* Light green background */
656
+ padding: 15px;
657
+ border-radius: 12px;
658
+ """,
659
  ),
660
  Details(
661
  Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
 
664
  0,
665
  "Sample documents filtered by excessive line repetitions / characters in repeated lines",
666
  ),
667
+ style="""
668
+ background-color: #EAFFF1; /* Light green background */
669
+ padding: 15px;
670
+ border-radius: 12px;
671
+ """,
672
  ),
673
  H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
674
  P("""
 
692
  value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
693
  attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
694
  """, block="block", language="python"),
695
+ style="""
696
+ background-color: #FFFAEA; /* Light yellow background */
697
+ padding: 15px;
698
+ border-radius: 12px;
699
+ """,
700
  ),
701
  Details(
702
  Summary("Implementations from RedPajama-V2"),
 
735
  score = round(score, PRECISION)
736
  return [(0, len(document), score)]
737
  """, block="block", language="python"),
738
+ style="""
739
+ background-color: #FFFAEA; /* Light yellow background */
740
+ padding: 15px;
741
+ border-radius: 12px;
742
+ """,
743
  ),
744
 
745
  Details(
 
763
  if top_char_length / len(text) > n_frac:
764
  return False, f"top_n_gram"
765
  """, block="block", language="python"),
766
+ style="""
767
+ background-color: #FFFAEA; /* Light yellow background */
768
+ padding: 15px;
769
+ border-radius: 12px;
770
+ """,
771
  ),
772
  P("""
773
  There are almost no contradictions between each implementations of fractions of characters in the most common
 
795
  value = count * sum(len(w) for w in most_common_ngram) / character_count
796
  attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
797
  """, block="block", language="python"),
798
+ style="""
799
+ background-color: #EAFFF1; /* Light green background */
800
+ padding: 15px;
801
+ border-radius: 12px;
802
+ """,
803
  ),
804
  Details(
805
  Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
 
808
  0,
809
  "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
810
  ),
811
+ style="""
812
+ background-color: #EAFFF1; /* Light green background */
813
+ padding: 15px;
814
+ border-radius: 12px;
815
+ """,
816
  ),
817
  H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
818
  P("""
 
839
  ) / max(ng_char_count, 1)
840
  attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
841
  """, block="block", language="python"),
842
+ style="""
843
+ background-color: #FFFAEA; /* Light yellow background */
844
+ padding: 15px;
845
+ border-radius: 12px;
846
+ """,
847
  ),
848
  Details(
849
  Summary("Implementations from RedPajama-V2"),
 
897
  score = round(score, PRECISION)
898
  return [(0, len(document), score)]
899
  """, block="block", language="python"),
900
+ style="""
901
+ background-color: #FFFAEA; /* Light yellow background */
902
+ padding: 15px;
903
+ border-radius: 12px;
904
+ """,
905
  ),
906
 
907
  Details(
 
927
  if n_duplicates_char / len(text) > n_frac:
928
  return False, f"duplicated_n_grams"
929
  """, block="block", language="python"),
930
+ style="""
931
+ background-color: #FFFAEA; /* Light yellow background */
932
+ padding: 15px;
933
+ border-radius: 12px;
934
+ """,
935
  ),
936
  P("""
937
  For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
 
985
  score = get_dup_ngram_frac(n, ngram_counts, text)
986
  attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
987
  """, block="block", language="python"),
988
+ style="""
989
+ background-color: #EAFFF1; /* Light green background */
990
+ padding: 15px;
991
+ border-radius: 12px;
992
+ """,
993
  ),
994
  Details(
995
  Summary("An example to show the difference between above implementations"),
 
1004
 
1005
  In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurence. This results in a fraction of 10/17.
1006
  """),
1007
+ style="""
1008
+ background-color: #EAFFF1; /* Light green background */
1009
+ padding: 15px;
1010
+ border-radius: 12px;
1011
+ """,
1012
  ),
1013
  H5(
1014
  "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
 
1020
  0,
1021
  "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
1022
  ),
1023
+ style="""
1024
+ background-color: #EAFFF1; /* Light green background */
1025
+ padding: 15px;
1026
+ border-radius: 12px;
1027
+ """,
1028
  ),
1029
  H3("3.2 Line-wise Heuristics"),
1030
  P("""
 
1051
  D_code("""
1052
  ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
1053
  """, block="block", language="python"),
1054
+ style="""
1055
+ background-color: #FFFAEA; /* Light yellow background */
1056
+ padding: 15px;
1057
+ border-radius: 12px;
1058
+ """,
1059
  ),
1060
  Details(
1061
  Summary("Bullet Point Identification Implemetations"),
 
1100
  "*", # * star
1101
  )
1102
  """, block="block", language="python"),
1103
+ style="""
1104
+ background-color: #FFFAEA; /* Light yellow background */
1105
+ padding: 15px;
1106
+ border-radius: 12px;
1107
+ """,
1108
  ),
1109
 
1110
 
 
1115
  0,
1116
  "Sample documents that are filtered out by line-wise heuristics",
1117
  ),
1118
+ style="""
1119
+ background-color: #EAFFF1; /* Light green background */
1120
+ padding: 15px;
1121
+ border-radius: 12px;
1122
+ """,
1123
  ),
1124
 
1125
  H3("3.3 Statistics-based Heuristics"),
 
1180
  text = unicodedata.normalize("NFD", text)
1181
  return text
1182
  """, block="block", language="python"),
1183
+ style="""
1184
+ background-color: #FFFAEA; /* Light yellow background */
1185
+ padding: 15px;
1186
+ border-radius: 12px;
1187
+ """,
1188
  ),
1189
 
1190
  Details(
 
1196
  non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
1197
  n_non_symbol_words_words = len(non_symbol_words)
1198
  """, block="block", language="python"),
1199
+ style="""
1200
+ background-color: #FFFAEA; /* Light yellow background */
1201
+ padding: 15px;
1202
+ border-radius: 12px;
1203
+ """,
1204
  ),
1205
  P("""
1206
  Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
 
1245
  score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
1246
  return [(0, len(document), score)]
1247
  """, block="block", language="python"),
1248
+ style="""
1249
+ background-color: #FFFAEA; /* Light yellow background */
1250
+ padding: 15px;
1251
+ border-radius: 12px;
1252
+ """,
1253
  ),
1254
  P("""
1255
  However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
 
1266
  ...
1267
  attrs.num_of_sentences = count_sentences(text)
1268
  """, block="block", language="python"),
1269
+ style="""
1270
+ background-color: #EAFFF1; /* Light green background */
1271
+ padding: 15px;
1272
+ border-radius: 12px;
1273
+ """,
1274
  ),
1275
 
1276
  H3("Symbol to Word Ratio"),
 
1287
  word_count, 1
1288
  )
1289
  """, block="block", language="python"),
1290
+ style="""
1291
+ background-color: #FFFAEA; /* Light yellow background */
1292
+ padding: 15px;
1293
+ border-radius: 12px;
1294
+ """,
1295
  ),
1296
  Details(
1297
  Summary("Implementations from RedPajama-V2"),
 
1318
  score = round(score, PRECISION)
1319
  return [(0, len(document), score)]
1320
  """, block="block", language="python"),
1321
+ style="""
1322
+ background-color: #FFFAEA; /* Light yellow background */
1323
+ padding: 15px;
1324
+ border-radius: 12px;
1325
+ """,
1326
  ),
1327
 
1328
  Details(
 
1333
  if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
1334
  return False, "gopher_too_many_ellipsis"
1335
  """, block="block", language="python"),
1336
+ style="""
1337
+ background-color: #FFFAEA; /* Light yellow background */
1338
+ padding: 15px;
1339
+ border-radius: 12px;
1340
+ """,
1341
  ),
1342
  Details(
1343
  Summary("TxT360 Implementation"),
 
1348
  ...
1349
  attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
1350
  """, block="block", language="python"),
1351
+ style="""
1352
+ background-color: #EAFFF1; /* Light green background */
1353
+ padding: 15px;
1354
+ border-radius: 12px;
1355
+ """,
1356
  ),
1357
 
1358
  H3("Fraction of Alphabetic Words"),
 
1363
  1 for word in words if any(c.isalpha() for c in word)
1364
  ) / max(word_count, 1)
1365
  """, block="block", language="python"),
1366
+ style="""
1367
+ background-color: #FFFAEA; /* Light yellow background */
1368
+ padding: 15px;
1369
+ border-radius: 12px;
1370
+ """,
1371
  ),
1372
  Details(
1373
  Summary("Implementations from RedPajama-V2"),
 
1392
  score = round(score, PRECISION)
1393
  return [(0, len(document), score)]
1394
  """, block="block", language="python"),
1395
+ style="""
1396
+ background-color: #FFFAEA; /* Light yellow background */
1397
+ padding: 15px;
1398
+ border-radius: 12px;
1399
+ """,
1400
  ),
1401
  Details(
1402
  Summary("Implementations from DataTrove"),
 
1408
  ):
1409
  return False, "gopher_below_alpha_threshold"
1410
  """, block="block", language="python"),
1411
+ style="""
1412
+ background-color: #FFFAEA; /* Light yellow background */
1413
+ padding: 15px;
1414
+ border-radius: 12px;
1415
+ """,
1416
  ),
1417
  P("""
1418
  Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
 
1439
  0,
1440
  "Sample documents that are filtered out by statistics-based heuristics",
1441
  ),
1442
+ style="""
1443
+ background-color: #EAFFF1; /* Light green background */
1444
+ padding: 15px;
1445
+ border-radius: 12px;
1446
+ """,
1447
  ),
1448
  H3("3.4 Others"),
1449
  P("""
 
1454
  Details(
1455
  Summary("Sample documents containing 'lorem ipsum'"),
1456
  DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
1457
+ style="""
1458
+ background-color: #FFC0CB; /* Light pink background */
1459
+ padding: 15px;
1460
+ border-radius: 12px;
1461
+ """,
1462
  ),
1463
  H2("4. Deduplication"),
1464
  P("""