victormiller committed on
Commit
7533364
·
verified ·
1 Parent(s): e4314a9

Update results.py

Browse files
Files changed (1) hide show
  1. results.py +61 -1
results.py CHANGED
@@ -614,6 +614,62 @@ fig.update_layout(
614
  # Show the figure
615
  llama_graph6 = fig
616
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
 
618
  intro_div = Div(
619
  H2("Perplexity Evaluation on Duplicate Data"),
@@ -621,13 +677,16 @@ intro_div = Div(
621
  P("We took one of the model-based data quality evaluation strategies adopted by [DataComp-LM](https://arxiv.org/abs/2406.11794), which used perplexity filtering as a candidate for quality filtering. DataComp-LM followed [CCNet’s](https://arxiv.org/abs/1911.00359) practice to use a 5-gram Kneser-Ney model as implemented in the [KenLM](https://github.com/kpu/kenlm) library for efficient perplexity calculation. Following this practice, we estimated data quality by taking a KenLM model (from [edugp/kenlm](https://huggingface.co/edugp/kenlm)) trained on English Wikipedia data to compute perplexity on data with different duplication patterns. Lower perplexity is regarded as a signal of higher quality."),
622
  H3("Sampling Strategy"),
623
  P("We started from a processed Common Crawl (CC) ablation dataset divided by the number of duplicates of each document. For each CC dump, we have different buckets each holding chunks of documents with different duplicate count ranges (1-1, 2-5, 6-10, 11-100, 101-1000, 1001-30000000). We sampled the first 10k documents from each chunk with their metadata."),
624
-
625
  )
626
 
627
  upsampling_exp = Div(
628
  H2("Upsampling Experiment: TxT360 vs FineWeb"),
629
  H3("Experiment Setup"),
630
  P("We performed a comparison of 1.5T tokens from FineWeb and 1.5T tokens of TxT360 across 10 diverse evaluations. Our FineWeb evaluation is based on a random sample of 1.5T tokens from FineWeb (base). For TxT360, we also randomly sample 1.5T tokens by upsampling data instances with more duplicates. Concretely, the upsampling weight is set to 3 for data points with duplicates in the range from 2 to 5, 5 for the range from 5 to 100, 8 for that from 101 to 1000, and 10 for more than 1000 duplicates."),
 
 
 
 
631
  )
632
 
633
  perp1_div = Div(
@@ -707,6 +766,7 @@ def results():
707
  return Div(
708
  Section(
709
  intro_div,
 
710
  perp1_div,
711
  llama_div,
712
  P("test plotly"),
 
614
  # Show the figure
615
  llama_graph6 = fig
616
 
617
+ ##txt360 vs fineweb comparison table
618
+ dataset_comparison = pd.DataFrame(
619
+ {
620
+ "Metric": [
621
+ "BoolQ",
622
+ "PIQA",
623
+ "HellaSwag",
624
+ "Winogrande",
625
+ "MMLU",
626
+ "Natural Questions",
627
+ "TriviaQA",
628
+ "GSM8K",
629
+ "MATH",
630
+ "MedQA",
631
+ ],
632
+ "TxT360 - Upsampling": [
633
+ "70.31",
634
+ "80.36",
635
+ "73.54",
636
+ "68.43",
637
+ "30.26",
638
+ "22.22",
639
+ "58.52",
640
+ "3.41",
641
+ "28.04",
642
+ "25.61",
643
+ ],
644
+ "FineWeb-1.5T": [
645
+ "71.5",
646
+ "82.1",
647
+ "79.46",
648
+ "70.96",
649
+ "28.5",
650
+ "19.03",
651
+ "36.65",
652
+ "5.31",
653
+ "29.65",
654
+ "27.26",
655
+ ],
656
+ "TxT360 Difference": [
657
+ "(1.19)",
658
+ "(1.74)",
659
+ "(5.92)",
660
+ "(2.53)",
661
+ "1.76",
662
+ "3.19",
663
+ "21.87",
664
+ "(1.9)",
665
+ "(1.61)",
666
+ "(1.65)",
667
+ ],
668
+ }
669
+ )
670
+
671
+ table_html = dataset_comparison.to_html(index=False, border=0)
672
+ table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
673
 
674
  intro_div = Div(
675
  H2("Perplexity Evaluation on Duplicate Data"),
 
677
  P("We took one of the model-based data quality evaluation strategies adopted by [DataComp-LM](https://arxiv.org/abs/2406.11794), which used perplexity filtering as a candidate for quality filtering. DataComp-LM followed [CCNet’s](https://arxiv.org/abs/1911.00359) practice to use a 5-gram Kneser-Ney model as implemented in the [KenLM](https://github.com/kpu/kenlm) library for efficient perplexity calculation. Following this practice, we estimated data quality by taking a KenLM model (from [edugp/kenlm](https://huggingface.co/edugp/kenlm)) trained on English Wikipedia data to compute perplexity on data with different duplication patterns. Lower perplexity is regarded as a signal of higher quality."),
678
  H3("Sampling Strategy"),
679
  P("We started from a processed Common Crawl (CC) ablation dataset divided by the number of duplicates of each document. For each CC dump, we have different buckets each holding chunks of documents with different duplicate count ranges (1-1, 2-5, 6-10, 11-100, 101-1000, 1001-30000000). We sampled the first 10k documents from each chunk with their metadata."),
 
680
  )
681
 
682
  upsampling_exp = Div(
683
  H2("Upsampling Experiment: TxT360 vs FineWeb"),
684
  H3("Experiment Setup"),
685
  P("We performed a comparison of 1.5T tokens from FineWeb and 1.5T tokens of TxT360 across 10 diverse evaluations. Our FineWeb evaluation is based on a random sample of 1.5T tokens from FineWeb (base). For TxT360, we also randomly sample 1.5T tokens by upsampling data instances with more duplicates. Concretely, the upsampling weight is set to 3 for data points with duplicates in the range from 2 to 5, 5 for the range from 5 to 100, 8 for that from 101 to 1000, and 10 for more than 1000 duplicates."),
686
+ table_div_1,
687
+ P("To account for differing dataset sizes, the evaluation scores represent the final evaluation score after the entire dataset has been processed."),
688
+ H3("Training Evaluations"),
689
+ P("We also conducted full-scale training using TxT360 and FineWeb-1.5T. Below are plots of the training and validation loss curves for each dataset. We can see that TxT360 achieves lower training and validation loss compared to FineWeb-1.5T."),
690
  )
691
 
692
  perp1_div = Div(
 
766
  return Div(
767
  Section(
768
  intro_div,
769
+ upsampling_exp,
770
  perp1_div,
771
  llama_div,
772
  P("test plotly"),