victormiller committed on
Commit
be782f3
·
verified ·
1 Parent(s): e04322e

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +4 -10
web.py CHANGED
@@ -708,25 +708,19 @@ def web_data():
708
  P("""
709
  There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
710
  """),
711
- Div(
712
- Code("""
713
  words = text.split()
714
  word_count = len(words)
715
  character_count = sum(len(word) for word in words)
716
  mean_word_length = character_count / word_count
717
- """),
718
- cls="code-block",
719
- ),
720
  P("""
721
  It's worth noting that Dolma used the median word length instead of the mean in their codes.
722
  """),
723
- Div(
724
- Code("""
725
  from statistics import median
726
  median_word_length = median(len(word) for word in words)
727
- """),
728
- cls="code-block",
729
- ),
730
  H5("Number of Sentences"),
731
  P("""
732
  The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
 
708
  P("""
709
  There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
710
  """),
711
+ D_code("""
 
712
  words = text.split()
713
  word_count = len(words)
714
  character_count = sum(len(word) for word in words)
715
  mean_word_length = character_count / word_count
716
+ """, block="block", language="python"),
 
 
717
  P("""
718
  It's worth noting that Dolma used the median word length instead of the mean in their codes.
719
  """),
720
+ D_code("""
 
721
  from statistics import median
722
  median_word_length = median(len(word) for word in words)
723
+ """, block="block", language="python"),
 
 
724
  H5("Number of Sentences"),
725
  P("""
726
  The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions