Spaces:
Running
Running
victormiller
committed on
Update curated.py
Browse files- curated.py +6 -6
curated.py
CHANGED
@@ -459,7 +459,7 @@ data_preprocessing_div = Div(
|
|
459 |
P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
|
460 |
P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
|
461 |
P("The ", B("Minimum Word Count Filter")," sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
|
462 |
-
P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
|
463 |
H3("Data Processing for S2ORC"),
|
464 |
P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
|
465 |
P("The ", B("Title and Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
|
@@ -558,7 +558,7 @@ filtering_process = Div(
|
|
558 |
Ol(
|
559 |
Li("Language Filter: any language other than English are discarded"),
|
560 |
Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
|
561 |
-
Li("Unigram Log Probablity Filter:
|
562 |
Li("Note: the Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
563 |
),
|
564 |
table_div_arx,
|
@@ -616,7 +616,7 @@ filtering_process = Div(
|
|
616 |
P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
|
617 |
H4("Download and Extraction"),
|
618 |
Ol(
|
619 |
-
Li("This was downloaded directly in zip format using S2ORC
|
620 |
),
|
621 |
H4("Filtering"),
|
622 |
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
|
@@ -936,9 +936,9 @@ filtering_process = Div(
|
|
936 |
P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
|
937 |
H4("Filtering"),
|
938 |
Ol(
|
939 |
-
Li("Language Filter: English", style = "margin-bottom:
|
940 |
-
Li("Minimum Word Count Filter: 20", style = "margin-bottom:
|
941 |
-
Li("Unigram Log Probability: ", "-20", style = "margin-bottom:
|
942 |
),
|
943 |
table_div_pg19,
|
944 |
Details(
|
|
|
459 |
P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
|
460 |
P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
|
461 |
P("The ", B("Minimum Word Count Filter")," sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
|
462 |
+
P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
|
463 |
H3("Data Processing for S2ORC"),
|
464 |
P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
|
465 |
P("The ", B("Title and Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
|
|
|
558 |
Ol(
|
559 |
Li("Language Filter: any language other than English are discarded"),
|
560 |
Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
|
561 |
+
Li("Unigram Log Probablity Filter Theshold: -20"),
|
562 |
Li("Note: the Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
563 |
),
|
564 |
table_div_arx,
|
|
|
616 |
P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
|
617 |
H4("Download and Extraction"),
|
618 |
Ol(
|
619 |
+
Li("This was downloaded directly in zip format using S2ORC API key and a get() request: ", D_code("response = urllib.request.urlopen(url)", language = "python")),
|
620 |
),
|
621 |
H4("Filtering"),
|
622 |
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
|
|
|
936 |
P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
|
937 |
H4("Filtering"),
|
938 |
Ol(
|
939 |
+
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
940 |
+
Li("Minimum Word Count Filter: 20", style = "margin-bottom: -3px"),
|
941 |
+
Li("Unigram Log Probability: ", "-20", style = "margin-bottom: -3px"),
|
942 |
),
|
943 |
table_div_pg19,
|
944 |
Details(
|