victormiller committed on
Commit
8a16e84
·
verified ·
1 Parent(s): 0bc171c

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +201 -24
curated.py CHANGED
@@ -571,6 +571,183 @@ phil_examples = Div(
571
  ),
572
  )
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  filtering_process = Div(
575
  Section(
576
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -605,10 +782,10 @@ filtering_process = Div(
605
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
606
  ),
607
  table_div_arx,
608
- # Details(
609
- # Summary("ArXiv Filtering Examples"),
610
- # arx_examples,
611
- # ),
612
  ),
613
  ),
614
  Section(
@@ -647,10 +824,10 @@ filtering_process = Div(
647
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
648
  ),
649
  table_div_s2o,
650
- # Details(
651
- # Summary("FreeLaw Filtering Examples -- need to update"),
652
- # freelaw_examples,
653
- # ),
654
  ),
655
  ),
656
  Section(
@@ -683,10 +860,10 @@ filtering_process = Div(
683
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
684
  ),
685
  table_div_med,
686
- # Details(
687
- # Summary("PubMed Filtering Examples"),
688
- # pubmed_examples,
689
- # ),
690
  ),
691
  ),
692
  Section(
@@ -715,10 +892,10 @@ filtering_process = Div(
715
  H4("Filtering"),
716
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
717
  table_div_up,
718
- # Details(
719
- # Summary("EuroParl Filtering Examples"),
720
- # eu_examples,
721
- # ),
722
  ),
723
  ),
724
  Section(
@@ -860,10 +1037,10 @@ filtering_process = Div(
860
  Li("None"),
861
  ),
862
  table_div_dmm,
863
- # Details(
864
- # Summary("DM Math Filtering Examples"),
865
- # dmm_examples,
866
- # ),
867
  ),
868
  ),
869
  Section(
@@ -881,10 +1058,10 @@ filtering_process = Div(
881
  Li("Unigram Log Probability"),
882
  ),
883
  table_div_pg19,
884
- #Details(
885
- # Summary("PG-19 Filtering Examples"),
886
- # pg19_examples,
887
- #),
888
  ),
889
  ),
890
  )
 
571
  ),
572
  )
573
 
574
# ArXiv example viewer: the component returned by get_arx_data, wrapped in a
# bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
arx_examples = Div(
    Div(get_arx_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
580
+
581
def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one S2ORC example.

    Args:
        data_source: Dataset selector. Only ``"S2ORC"`` loads the sample file;
            any other value falls back to ten empty placeholder documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected sample.

    Raises:
        IndexError: If the sample file holds fewer entries than ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "S2ORC":
        # Use a context manager so the handle is closed deterministically, and
        # pin UTF-8 so decoding does not depend on the host locale.
        with open("data/curated_samples/s2orc_raw.json", encoding="utf-8") as sample_file:
            # No separate extract file is read here: the raw list doubles as
            # the extracted list.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="S2ORC",
        data_sources="S2ORC",
        target=target,
    )
601
+
602
# S2ORC example viewer: the component returned by get_S2ORC_data, wrapped in a
# bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
s2o_examples = Div(
    Div(get_S2ORC_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
608
+
609
def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one S2ORC Abstract example.

    Args:
        data_source: Dataset selector. ``"S2ORC Abstract"`` (the default) loads
            the abstract sample file. ``"S2ORC"`` is also accepted because it
            was the only value that loaded the file before this fix. Any other
            value falls back to ten empty placeholder documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected sample.

    Raises:
        IndexError: If the sample file holds fewer entries than ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    # The previous check compared against "S2ORC" only, so the default
    # "S2ORC Abstract" never matched and the viewer always showed empty
    # placeholder documents.
    if data_source in ("S2ORC Abstract", "S2ORC"):
        # Use a context manager so the handle is closed deterministically, and
        # pin UTF-8 so decoding does not depend on the host locale.
        with open(
            "data/curated_samples/s2orc_abstract_raw.json", encoding="utf-8"
        ) as sample_file:
            # No separate extract file is read here: the raw list doubles as
            # the extracted list.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="S2ORC Abstract",
        data_sources="S2ORC Abstract",
        target=target,
    )
629
+
630
# S2ORC Abstract example viewer: the component returned by get_S2ORCA_data,
# wrapped in a bordered, padded container. Built once when the module is
# imported, with a freshly generated id passed as the viewer's `target`.
s2oa_examples = Div(
    Div(get_S2ORCA_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
636
+
637
def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one PubMed example.

    Args:
        data_source: Dataset selector. Only ``"Pubmed"`` loads the raw and
            extracted sample files; any other value falls back to ten empty
            placeholder documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.

    Raises:
        IndexError: If either sample file holds fewer entries than
            ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Pubmed":
        # Use context managers so both handles are closed deterministically,
        # and pin UTF-8 so decoding does not depend on the host locale.
        with open("data/curated_samples/pubmed_raw.json", encoding="utf-8") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/pubmed_extract.json", encoding="utf-8") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="Pubmed",
        data_sources="Pubmed",
        target=target,
    )
658
+
659
# PubMed example viewer: the component returned by get_pubmed_data, wrapped in
# a bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
pubmed_examples = Div(
    Div(get_pubmed_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
665
+
666
def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one DM Math example.

    Args:
        data_source: Dataset selector. Only ``"DM Math"`` loads the raw and
            extracted sample files; any other value falls back to ten empty
            placeholder documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.

    Raises:
        IndexError: If either sample file holds fewer entries than
            ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "DM Math":
        # Use context managers so both handles are closed deterministically,
        # and pin UTF-8 so decoding does not depend on the host locale.
        with open("data/curated_samples/dm_maths_raw.json", encoding="utf-8") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/dm_maths_extract.json", encoding="utf-8") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="DM Math",
        data_sources="DM Math",
        target=target,
    )
687
+
688
# DM Math example viewer: the component returned by get_dmm_data, wrapped in a
# bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
dmm_examples = Div(
    Div(get_dmm_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
694
+
695
def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one PG-19 example.

    Args:
        data_source: Dataset selector. Only ``"PG19"`` loads the sample file;
            any other value falls back to ten empty placeholder documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected sample.

    Raises:
        IndexError: If the sample file holds fewer entries than ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "PG19":
        # Use a context manager so the handle is closed deterministically, and
        # pin UTF-8 so decoding does not depend on the host locale.
        with open("data/curated_samples/pg19_raw.json", encoding="utf-8") as sample_file:
            # No separate extract file is read here: the raw list doubles as
            # the extracted list.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="PG19",
        data_sources="PG19",
        target=target,
    )
715
+
716
# PG-19 example viewer: the component returned by get_pg19_data, wrapped in a
# bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
pg19_examples = Div(
    Div(get_pg19_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
722
+
723
def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
    """Build the sample-document view for one EuroParl example.

    Args:
        data_source: Dataset selector. Only ``"Europarl"`` loads the sample
            file; any other value falls back to ten empty placeholder
            documents.
        doc_id: Index of the sample to display. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected sample.

    Raises:
        IndexError: If the sample file holds fewer entries than ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Europarl":
        # Use a context manager so the handle is closed deterministically, and
        # pin UTF-8 so decoding does not depend on the host locale.
        with open("data/curated_samples/europarl_raw.json", encoding="utf-8") as sample_file:
            # No separate extract file is read here: the raw list doubles as
            # the extracted list.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="Europarl",
        data_sources="Europarl",
        target=target,
    )
743
+
744
# EuroParl example viewer: the component returned by get_eu_data, wrapped in a
# bordered, padded container. Built once when the module is imported, with a
# freshly generated id passed as the viewer's `target`.
eu_examples = Div(
    Div(get_eu_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;"),
)
750
+
751
  filtering_process = Div(
752
  Section(
753
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
 
782
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
783
  ),
784
  table_div_arx,
785
+ Details(
786
+ Summary("ArXiv Filtering Examples"),
787
+ arx_examples,
788
+ ),
789
  ),
790
  ),
791
  Section(
 
824
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
825
  ),
826
  table_div_s2o,
827
+ Details(
828
+ Summary("FreeLaw Filtering Examples -- need to update"),
829
+ freelaw_examples,
830
+ ),
831
  ),
832
  ),
833
  Section(
 
860
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
861
  ),
862
  table_div_med,
863
+ Details(
864
+ Summary("PubMed Filtering Examples"),
865
+ pubmed_examples,
866
+ ),
867
  ),
868
  ),
869
  Section(
 
892
  H4("Filtering"),
893
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
894
  table_div_up,
895
+ Details(
896
+ Summary("EuroParl Filtering Examples"),
897
+ eu_examples,
898
+ ),
899
  ),
900
  ),
901
  Section(
 
1037
  Li("None"),
1038
  ),
1039
  table_div_dmm,
1040
+ Details(
1041
+ Summary("DM Math Filtering Examples"),
1042
+ dmm_examples,
1043
+ ),
1044
  ),
1045
  ),
1046
  Section(
 
1058
  Li("Unigram Log Probability"),
1059
  ),
1060
  table_div_pg19,
1061
+ Details(
1062
+ Summary("PG-19 Filtering Examples"),
1063
+ pg19_examples,
1064
+ ),
1065
  ),
1066
  ),
1067
  )