victormiller commited on
Commit
5672cf7
·
verified ·
1 Parent(s): f36591a

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +61 -113
curated.py CHANGED
@@ -458,15 +458,14 @@ filtering_process = Div(
458
  Section(
459
  H3("ArXiv"),
460
  H4("Download and Extraction"),
461
- P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), "We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
462
  H4("Filtering"),
463
- P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
464
  Ol(
465
- Li("min_word: less than 500 words (not inclusive) are discarded"),
466
- Li("Language: any language other than English are discarded"),
467
- Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
468
- Li("Unigram log probablity: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list available created by Rachel Tatman. A copy is hosted here."),
469
- Li("number 4 above had hyperlinks that need to be included"),
470
  ),
471
  H4("Local Deduplication Process"),
472
  Ol(
@@ -510,7 +509,7 @@ filtering_process = Div(
510
  table_div_s2o,
511
  ),
512
  Section(
513
- H3("PubMed"),
514
  H4("Download and Extraction"),
515
  Ol(
516
  Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
@@ -541,12 +540,7 @@ filtering_process = Div(
541
  Section(
542
  H3("Phil Papers"),
543
  H4("Download and Extraction"),
544
- Ol(
545
- Li("Original pdf files download location was downloaded from https://philarchive.org/oai.pl "),
546
- Li("All pdf files were downloaded"),
547
- Li("Pdf was converted to text using java -jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}"),
548
- Li("Language was detected and added using langdetect library"),
549
- ),
550
  H4("Filtering"),
551
  Ol(
552
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
@@ -555,120 +549,84 @@ filtering_process = Div(
555
  Ol(
556
  Li("Local dedup was done with all papers combined."),
557
  ),
558
- H4("Global Deduplication Process"),
559
- Ol(
560
- Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
561
- ),
562
  table_div_phil,
563
  ),
564
  Section(
565
  H3("Europarl"),
566
  H4("Download and Extraction"),
567
- Ol(
568
- Li("Original data was downloaded from http://www.statmt.org/europarl/v7/europarl.tgz"),
569
- Li("Finally the remaining files are converted to jsonl lines"),
570
- ),
571
  H4("Filtering"),
572
- Ol(
573
- Li("Smaller than 200 characters of documents are removed while downloading so no others filtered were run"),
574
- Li("Tags were also removed while downloading"),
575
- ),
576
  H4("Local Deduplication Process"),
577
  Ol(
578
  Li("Local dedup was done within europarl itself"),
579
  ),
580
- H4("Global Deduplication Process"),
581
- Ol(
582
- Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
583
- ),
584
  table_div_up,
585
  ),
586
  Section(
587
  H3("HackerNews"),
588
  H4("Download and Extraction"),
589
- Ol(
590
- Li("Data was parsed using hackernews story ids starting using https://hacker-news.firebaseio.com/v0/item/"),
591
- Li("Story ids was started from 1 till 37500000 (all stories that gives error while pinging the url was removed). Each post is a story, with each reply another story"),
592
- Li("As there were too many requests error, there was a wait(2 sec) statement included in the code"),
593
- Li("As the number of stories were large and containing all the replies was time consuming and possibility of introducing too much error, only longest depth threads were included from 3rd level onwards. So we include the title then all the replies (2nd level) but replies to those replies (3rd level) were only the ones which has maximum depth."),
594
- ),
595
  H4("Filtering"),
596
  Ol(
597
- Li("Min word: 10"),
598
- Li("Language: Only english"),
599
- Li("Unigram log probablity"),
600
  ),
601
  H4("Local Deduplication Process"),
602
  Ol(
603
  Li("Local dedup was done within hackernews itself"),
604
  ),
605
- H4("Global Deduplication Process"),
606
- Ol(
607
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
608
- ),
609
  table_div_hn,
610
  ),
611
  Section(
612
  H3("USPTO"),
613
  H4("Download and Extraction"),
614
- Ol(
615
- Li("Data was downloaded and extracted using tags from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),
616
- Li("There were three different format that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, post_2004"),
617
-
618
- ),
619
  H4("Filtering"),
620
  Ol(
621
- Li("Min word: 50"),
622
- Li("Language: Only english"),
623
- Li("Unigram log probablity"),
624
  ),
625
  H4("Local Deduplication Process"),
626
  Ol(
627
  Li("Local dedup was done within USPTO itself"),
628
  ),
629
- H4("Global Deduplication Process"),
630
- Ol(
631
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
632
- ),
633
  table_div_uspto,
634
  ),
635
  Section(
636
  H3("FreeLaw"),
637
  H4("Download and Extraction"),
638
- Ol(
639
- Li("CSV format bulk data was downloaded from https://storage.courtlistener.com/bulk-data/"),
640
- Li("They have multiple dumps as shown below with lot of duplicates (exact number is given in the table at the top)"),
641
- Li("there is an image to show here!"),
642
- Li("As these are csv files, they have multiple columns where text can be present, so we extracted text from the following columns using html2text function which just convert and extract tags from html tags"),
643
- Li("image to show"),
644
- Li("Text was also extracted from row named 'plain_text'"),
645
- Li("Priority is always given to plain_text first then from 6 to 1 in the subsequent order following pile logic"),
646
- ),
 
647
  H4("Filtering"),
648
  Ol(
649
- Li("Min word: 50"),
650
- Li("Language: Only english"),
651
- Li("Unigram log probablity"),
652
  ),
653
  H4("Local Deduplication Process"),
654
  Ol(
655
  Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
656
  ),
657
- H4("Global Deduplication Process"),
658
- Ol(
659
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
660
- ),
661
  table_div_freelaw,
662
  ),
663
  Section(
664
  H3("StackExchange"),
665
  H4("Download and Extraction"),
666
- Ol(
667
- Li("Archive dump was used to download data from all the stackexchange sub urls, eg., math.stackexchange etc."),
668
- Li("Raw data is in XML format with lot of metadata. We only used two files Posts.xml and Comments.xml"),
669
- Li("We parsed using post_id to connect each question to answer and then to comments so our data has same hierarchy as stackexchange UI"),
670
- Li("""
671
- 1. Questions:
672
  2. Comment1:
673
  3. Comment2:
674
  4. Answer1:
@@ -677,87 +635,77 @@ filtering_process = Div(
677
  7. Answer2:
678
  8. Comment1:
679
  9. Comment2:
680
- """),
681
- ),
682
  H4("Filtering"),
683
  Ol(
684
- Li("Min word: 10"),
685
  ),
686
  H4("Local Deduplication Process"),
687
  Ol(
688
  Li("Local dedup was done within stackexchange itself"),
689
  ),
690
- H4("Global Deduplication Process"),
691
- Ol(
692
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
693
- ),
694
  table_div_se,
695
  ),
696
  Section(
697
  H3("Ubuntu IRC"),
698
  H4("Download and Extraction"),
699
- Ol(
700
- Li("All the data was downloaded from https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/ based on the year"),
701
- Li("During extraction, we cleaned the logs using following functions"),
702
- Li("image here"),
703
- ),
 
 
 
 
 
 
 
704
  H4("Filtering"),
705
  Ol(
706
- Li("Min word: 10"),
707
- Li("Language: Only english"),
708
- Li("Unigram log probablity"),
709
  ),
710
  H4("Local Deduplication Process"),
711
  Ol(
712
  Li("Local dedup was done within Ubuntu IRC itself"),
713
  ),
714
- H4("Global Deduplication Process"),
715
- Ol(
716
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
717
- ),
718
  table_div_uirc,
719
  ),
720
  Section(
721
  H3("DM Maths"),
722
  H4("Download and Extraction"),
723
- Ol(
724
- Li("Directly downloaded from hugging-face dump dm_maths/"),
725
- Li("Data was converted in jsonl format where each lines are : Question: TEXT Answer: TEXT"),
726
- ),
727
  H4("Filtering"),
728
  Ol(
729
- Li("None"),
730
  ),
731
  H4("Local Deduplication Process"),
732
  Ol(
733
  Li("None"),
734
  ),
735
- H4("Global Deduplication Process"),
736
- Ol(
737
- Li("None"),
738
- ),
739
  table_div_dmm,
740
  ),
741
  Section(
742
  H3("PG19"),
743
  H4("Download and Extraction"),
744
  Ol(
745
- Li("Directly downloaded from hugging-face dump pg19/"),
746
  ),
747
  H4("Filtering"),
748
  Ol(
749
- Li("Min word: 20"),
750
- Li("Language: ???"),
751
- Li("Unigram log probablity"),
752
  ),
753
  H4("Local Deduplication Process"),
754
  Ol(
755
  Li("Local dedup was done within PG19 itself"),
756
  ),
757
- H4("Global Deduplication Process"),
758
- Ol(
759
- Li("After local dedup, remaining data was deduped again with all the datasets combined"),
760
- ),
761
  table_div_pg19,
762
  ),
763
  )
 
458
  Section(
459
  H3("ArXiv"),
460
  H4("Download and Extraction"),
461
+ P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format ", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
462
  H4("Filtering"),
463
+ P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
464
  Ol(
465
+ Li("Language Filter: any language other than English are discarded"),
466
+ Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
467
+ Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the ", A("1T Web Ngram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"), ". Specifically, we use the list created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"), "."),
468
+ Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
 
469
  ),
470
  H4("Local Deduplication Process"),
471
  Ol(
 
509
  table_div_s2o,
510
  ),
511
  Section(
512
+ H3("PubMed - need to update with abstract vs central"),
513
  H4("Download and Extraction"),
514
  Ol(
515
  Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
 
540
  Section(
541
  H3("Phil Papers"),
542
  H4("Download and Extraction"),
543
+ P("Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using ", D_code("java -jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language was detected and added using the langdetect (citation needed) library."),
 
 
 
 
 
544
  H4("Filtering"),
545
  Ol(
546
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
 
549
  Ol(
550
  Li("Local dedup was done with all papers combined."),
551
  ),
 
 
 
 
552
  table_div_phil,
553
  ),
554
  Section(
555
  H3("Europarl"),
556
  H4("Download and Extraction"),
557
+ P("Original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to jsonl lines for filtering."),
 
 
 
558
  H4("Filtering"),
559
+ P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
 
 
 
560
  H4("Local Deduplication Process"),
561
  Ol(
562
  Li("Local dedup was done within europarl itself"),
563
  ),
 
 
 
 
564
  table_div_up,
565
  ),
566
  Section(
567
  H3("HackerNews"),
568
  H4("Download and Extraction"),
569
+ P("The dataset was downloaded from the HackerNews repo here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for each Story ID was pinged. If that ID returned an error, the ID was removed. A 2 second wait was added between requests to avoid 'too many requests' errors."),
570
+ P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, replies from the 3rd level onwards were only included for the longest threads. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) were only included for the threads with the maximum depth."),
 
 
 
 
571
  H4("Filtering"),
572
  Ol(
573
+ Li("Language Filter: English"),
574
+ Li("Minimum Word Count Filter: 10"),
575
+ Li("Unigram Log Probability"),
576
  ),
577
  H4("Local Deduplication Process"),
578
  Ol(
579
  Li("Local dedup was done within hackernews itself"),
580
  ),
 
 
 
 
581
  table_div_hn,
582
  ),
583
  Section(
584
  H3("USPTO"),
585
  H4("Download and Extraction"),
586
+ P("Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, and post_2004."),
 
 
 
 
587
  H4("Filtering"),
588
  Ol(
589
+ Li("Language Filter: English"),
590
+ Li("Minimum Word Count Filter: 50"),
591
+ Li("Unigram Log Probability"),
592
  ),
593
  H4("Local Deduplication Process"),
594
  Ol(
595
  Li("Local dedup was done within USPTO itself"),
596
  ),
 
 
 
 
597
  table_div_uspto,
598
  ),
599
  Section(
600
  H3("FreeLaw"),
601
  H4("Download and Extraction"),
602
+ P("The dataset was downloaded from: ", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following columns using the html2text function.",
603
+ D_code("""
604
+ ("html", html2text),
605
+ ("html_lawbox", html2text),
606
+ ("html_columbia", html2text),
607
+ ("html_anon_2020", html2text),
608
+ ("html_with_citations", html2text),
609
+ ("xml_harvard", html2text),
610
+ plain_text
611
+ """, language="SQL"), " All content was downloaded, leading to a high number of documents being filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
612
  H4("Filtering"),
613
  Ol(
614
+ Li("Language Filter: English"),
615
+ Li("Minimum Word Count Filter: 50"),
616
+ Li("Unigram Log Probability"),
617
  ),
618
  H4("Local Deduplication Process"),
619
  Ol(
620
  Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
621
  ),
 
 
 
 
622
  table_div_freelaw,
623
  ),
624
  Section(
625
  H3("StackExchange"),
626
  H4("Download and Extraction"),
627
+ P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="https://math.stackexchange.com"), ". Raw data was extracted in XML format and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
628
+ P("""
629
+ 1. Questions:
 
 
 
630
  2. Comment1:
631
  3. Comment2:
632
  4. Answer1:
 
635
  7. Answer2:
636
  8. Comment1:
637
  9. Comment2:
638
+ """),
 
639
  H4("Filtering"),
640
  Ol(
641
+ Li("Minimum Word Count Filter: 10"),
642
  ),
643
  H4("Local Deduplication Process"),
644
  Ol(
645
  Li("Local dedup was done within stackexchange itself"),
646
  ),
 
 
 
 
647
  table_div_se,
648
  ),
649
  Section(
650
  H3("Ubuntu IRC"),
651
  H4("Download and Extraction"),
652
+ P("The dataset was downloaded from: ", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
653
+ P("During extraction, the logs were cleaned using following functions:"),
654
+ D_code("""
655
+ def exclude_system(x):
656
+ return '\n'.join(line for line in x.split('\n') if not line.startswith('==='))
657
+
658
+ def exclude_select_system(x):
659
+ return '\n'.join(line for line in x.split('\n') if not (line.startswith('===') and any(term in line for term in ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
660
+
661
+ def clean(x):
662
+ return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
663
+ """, block="block", language="python" ),
664
  H4("Filtering"),
665
  Ol(
666
+ Li("Language Filter: English"),
667
+ Li("Minimum Word Count Filter: 10"),
668
+ Li("Unigram Log Probability"),
669
  ),
670
  H4("Local Deduplication Process"),
671
  Ol(
672
  Li("Local dedup was done within Ubuntu IRC itself"),
673
  ),
 
 
 
 
674
  table_div_uirc,
675
  ),
676
  Section(
677
  H3("DM Maths"),
678
  H4("Download and Extraction"),
679
+ P("The dataset was downloaded directly from the Huggingface repo: ", A("https://huggingface.co/datasets/deepmind/math_dataset", href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where each line is represented as:"),
680
+ D_code("""
681
+ Question: TEXT
682
+ Answer: TEXT""", block="block", language="python"),
683
  H4("Filtering"),
684
  Ol(
685
+ Li("No filtering was applied to DM Math"),
686
  ),
687
  H4("Local Deduplication Process"),
688
  Ol(
689
  Li("None"),
690
  ),
 
 
 
 
691
  table_div_dmm,
692
  ),
693
  Section(
694
  H3("PG19"),
695
  H4("Download and Extraction"),
696
  Ol(
697
+ Li("The dataset was downloaded directly from Huggingface: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
698
  ),
699
  H4("Filtering"),
700
  Ol(
701
+ Li("Language Filter: ???"),
702
+ Li("Minimum Word Count Filter: 20"),
703
+ Li("Unigram Log Probability"),
704
  ),
705
  H4("Local Deduplication Process"),
706
  Ol(
707
  Li("Local dedup was done within PG19 itself"),
708
  ),
 
 
 
 
709
  table_div_pg19,
710
  ),
711
  )