victormiller committed on
Commit
6a336ca
·
verified ·
1 Parent(s): 66a1161

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +13 -13
curated.py CHANGED
@@ -539,7 +539,7 @@ data_preprocessing_div = Div(
539
  P(
540
  "The ",
541
  B("Minimum Word Count Filter"),
542
- " sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource.",
543
  ),
544
  P(
545
  "The ",
@@ -570,7 +570,7 @@ data_preprocessing_div = Div(
570
  P(
571
  "The ",
572
  B("Paragraph Count Filter"),
573
- " counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful hueristic for document complexity.",
574
  ),
575
  P(
576
  "The ",
@@ -659,7 +659,7 @@ filtering_process = Div(
659
  ),
660
  P(
661
  B("Filtering: "),
662
- "Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed.",
663
  ),
664
  table_div_wikipedia,
665
  Details(
@@ -694,10 +694,10 @@ filtering_process = Div(
694
  ),
695
  ". All markdowns were combined to create jsonl files.",
696
  ),
697
- P(B("Unique Data Preperation Challenges: ")),
698
  Ul(
699
  Li(
700
- "Due to large amounts of meaningful data being contained in table formats, speical consideration was taken to extract the data and proper metadata.",
701
  style="margin-bottom: -3px",
702
  ),
703
  ),
@@ -715,7 +715,7 @@ filtering_process = Div(
715
  style="margin-bottom: -3px",
716
  ),
717
  Li(
718
- "Unigram Log Probablity Filter Theshold: -20",
719
  style="margin-bottom: -3px",
720
  ),
721
  Li(
@@ -859,7 +859,7 @@ filtering_process = Div(
859
  D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),
860
  ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
861
  ),
862
- P(B("Unique Data Preperation Challenges: ")),
863
  Ul(
864
  Li(
865
  "Due to large amounts of meaningful data being contained in table formats, speical consideration was taken to extract the data and proper metadata.",
@@ -1112,7 +1112,7 @@ filtering_process = Div(
1112
  P(
1113
  "The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level)."
1114
  ),
1115
- P(B("Unique Data Preperation Challenges: ")),
1116
  Ul(
1117
  Li(
1118
  "As discussed above, the comment heirarchies required a thoughful approach to extracting meaningful data. ",
@@ -1190,7 +1190,7 @@ filtering_process = Div(
1190
  P(
1191
  "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
1192
  ),
1193
- P(B("Unique Data Preperation Challenges: ")),
1194
  Ul(
1195
  Li(
1196
  "Consecutive whitespaces and tabs were found. Consecutive Whitespaces and tabes were reduce to one, single whitespace.",
@@ -1261,7 +1261,7 @@ filtering_process = Div(
1261
  block="block",
1262
  language="python",
1263
  ),
1264
- P(B("Unique Data Preperation Challenges: ")),
1265
  Ul(
1266
  Li(
1267
  "Handling code block was a required finding the specific blocks and exacting the details in one snippet.",
@@ -1328,7 +1328,7 @@ filtering_process = Div(
1328
  block="block",
1329
  language="python",
1330
  ),
1331
- P(B("Unique Data Preperation Challenges: ")),
1332
  Ul(
1333
  Li(
1334
  "Similar to the HackerNews challenges, we had to map comments and sub-comments to the original question.",
@@ -1366,7 +1366,7 @@ filtering_process = Div(
1366
  block="block",
1367
  language="python",
1368
  ),
1369
- P(B("Unique Data Preperation Challenges: ")),
1370
  Ul(
1371
  Li(
1372
  "A byte string was included at the beginning of new lines",
@@ -1409,7 +1409,7 @@ filtering_process = Div(
1409
  ),
1410
  ".",
1411
  ),
1412
- P(B("Unique Data Preperation Challenges: ")),
1413
  Ul(
1414
  Li(
1415
  "Consecutive whitespaces were found spanning 10+ whitespace entries. These whitespaces were reduce to one, single whitespace.",
 
539
  P(
540
  "The ",
541
  B("Minimum Word Count Filter"),
542
+ " sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each data source.",
543
  ),
544
  P(
545
  "The ",
 
570
  P(
571
  "The ",
572
  B("Paragraph Count Filter"),
573
+ " counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful heuristic for document complexity.",
574
  ),
575
  P(
576
  "The ",
 
659
  ),
660
  P(
661
  B("Filtering: "),
662
+ "Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed.",
663
  ),
664
  table_div_wikipedia,
665
  Details(
 
694
  ),
695
  ". All markdowns were combined to create jsonl files.",
696
  ),
697
+ P(B("Unique Data Preparation Challenges: ")),
698
  Ul(
699
  Li(
700
+ "Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.",
701
  style="margin-bottom: -3px",
702
  ),
703
  ),
 
715
  style="margin-bottom: -3px",
716
  ),
717
  Li(
718
+ "Unigram Log Probability Filter Threshold: -20",
719
  style="margin-bottom: -3px",
720
  ),
721
  Li(
 
859
  D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),
860
  ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
861
  ),
862
+ P(B("Unique Data Preparation Challenges: ")),
863
  Ul(
864
  Li(
865
  "Due to large amounts of meaningful data being contained in table formats, speical consideration was taken to extract the data and proper metadata.",
 
1112
  P(
1113
  "The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level)."
1114
  ),
1115
+ P(B("Unique Data Preparation Challenges: ")),
1116
  Ul(
1117
  Li(
1118
  "As discussed above, the comment heirarchies required a thoughful approach to extracting meaningful data. ",
 
1190
  P(
1191
  "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
1192
  ),
1193
+ P(B("Unique Data Preparation Challenges: ")),
1194
  Ul(
1195
  Li(
1196
  "Consecutive whitespaces and tabs were found. Consecutive Whitespaces and tabes were reduce to one, single whitespace.",
 
1261
  block="block",
1262
  language="python",
1263
  ),
1264
+ P(B("Unique Data Preparation Challenges: ")),
1265
  Ul(
1266
  Li(
1267
  "Handling code block was a required finding the specific blocks and exacting the details in one snippet.",
 
1328
  block="block",
1329
  language="python",
1330
  ),
1331
+ P(B("Unique Data Preparation Challenges: ")),
1332
  Ul(
1333
  Li(
1334
  "Similar to the HackerNews challenges, we had to map comments and sub-comments to the original question.",
 
1366
  block="block",
1367
  language="python",
1368
  ),
1369
+ P(B("Unique Data Preparation Challenges: ")),
1370
  Ul(
1371
  Li(
1372
  "A byte string was included at the beginning of new lines",
 
1409
  ),
1410
  ".",
1411
  ),
1412
+ P(B("Unique Data Preparation Challenges: ")),
1413
  Ul(
1414
  Li(
1415
  "Consecutive whitespaces were found spanning 10+ whitespace entries. These whitespaces were reduce to one, single whitespace.",