fh-new-vm1

Sleeping

App Files Community

victormiller committed on Sep 24, 2024

Commit

e759b31

verified ·

1 Parent(s): 85e7ef7

Update overview

Browse files

Files changed (1) hide show

overview +146 -1

overview CHANGED Viewed

@@ -11,7 +11,152 @@ import web
 import common
 import results
 def overview():
-    return Div()

 import common
 import results
+dataset_comparison = pd.DataFrame(  # module-level comparison table: one column per dict key, eight entries per column
+        {
+            "Dataset": [  # row labels; every other column lists its cells in this same order
+                "TxT360",
+                "FineWeb",
+                "RefinedWeb",
+                "RedPajama-v2",
+                "C4",
+                "Dolma",
+                "RedPajama-v1",
+                "The Pile",
+            ],
+            "CommonCrawl": [  # free-text snapshot counts, one per dataset row
+                "99 Snapshots",
+                "96 Snapshots",
+                "90 Snapshots",
+                "84 Snapshots",
+                "1 Snapshots",  # NOTE(review): plural noun with a count of 1; "Papers" below uses the singular "1 Source"
+                "24 Snapshots",
+                "5 Snapshots",
+                "0.6% of 74 Snapshots",
+            ],
+            "Papers": [  # "-" presumably means the dataset has no such source -- TODO confirm
+                "5 Sources",
+                "-",
+                "-",
+                "-",
+                "-",
+                "1 Source",
+                "1 Source",
+                "4 Sources",
+            ],
+            "Wikipedia": [  # NOTE(review): cells read as generic filter benefits, not Wikipedia coverage; looks like placeholder text -- confirm
+                "Improves data quality by removing irrelevant documents",
+                "Filters out low-quality or incomplete documents",
+                "Provides additional information for analysis",
+                "Enables language-specific analysis and insights",
+                "Helps understand the complexity and content of documents",
+                "Identifies important terms and topics in the dataset",
+                "Quantifies the importance of individual words",
+                "RedPajama-v1",  # NOTE(review): a dataset name is used as the cell value for the "The Pile" row -- confirm
+            ],
+            "FreeLaw": [  # NOTE(review): this exact list is repeated for every column from "FreeLaw" through "Code"; looks like placeholder text -- confirm
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "DM Math": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "USPTO": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "PG-19": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "HackerNews": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "Ubuntu IRC": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "EuroParl": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "StackExchange": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "Code": [  # same eight cells as "FreeLaw"
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+        }
+    )
+table_html = dataset_comparison.to_html(index=False, border=0)  # HTML table markup as a string; index=False leaves out the row-index column
+table_div = Div(NotStr(table_html), style="margin: 40px;")  # Div with a 40px margin around the table; NotStr presumably keeps the markup unescaped -- confirm
 def overview():  # returns a Div wrapping one Section: heading, sub-heading, table caption, then the comparison table
+    return Div(Section(
+            H2("Combining the Best of Web and Curated Sources"),  # section heading
+            H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),  # sub-heading phrased as a question
+            P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
+            table_div,  # module-level Div holding the rendered dataset_comparison table
+            id="section5",  # id set on the Section element
+        ),
+        id="inner-text",  # id set on the outer Div
+        )