victormiller committed on
Commit
e759b31
·
verified ·
1 Parent(s): 85e7ef7

Update overview

Browse files
Files changed (1) hide show
  1. overview +146 -1
overview CHANGED
@@ -11,7 +11,152 @@ import web
11
  import common
12
  import results
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def overview():
16
- return Div()
 
 
 
 
 
 
 
 
17
 
 
11
  import common
12
  import results
13
 
14
+ dataset_comparison = pd.DataFrame(
15
+ {
16
+ "Dataset": [
17
+ "TxT360",
18
+ "FineWeb",
19
+ "RefinedWeb",
20
+ "RedPajama-v2",
21
+ "C4",
22
+ "Dolma",
23
+ "RedPajama-v1",
24
+ "The Pile",
25
+ ],
26
+ "CommonCrawl": [
27
+ "99 Snapshots",
28
+ "96 Snapshots",
29
+ "90 Snapshots",
30
+ "84 Snapshots",
31
+ "1 Snapshots",
32
+ "24 Snapshots",
33
+ "5 Snapshots",
34
+ "0.6% of 74 Snapshots",
35
+ ],
36
+ "Papers": [
37
+ "5 Sources",
38
+ "-",
39
+ "-",
40
+ "-",
41
+ "-",
42
+ "1 Source",
43
+ "1 Source",
44
+ "4 Sources",
45
+ ],
46
+ "Wikipedia": [
47
+ "Improves data quality by removing irrelevant documents",
48
+ "Filters out low-quality or incomplete documents",
49
+ "Provides additional information for analysis",
50
+ "Enables language-specific analysis and insights",
51
+ "Helps understand the complexity and content of documents",
52
+ "Identifies important terms and topics in the dataset",
53
+ "Quantifies the importance of individual words",
54
+ "RedPajama-v1",
55
+ ],
56
+ "FreeLaw": [
57
+ "May exclude documents in less common languages",
58
+ "May remove documents with valuable information",
59
+ "May introduce bias in the analysis",
60
+ "May not accurately represent the language distribution",
61
+ "May not capture the complexity of document structure",
62
+ "May be sensitive to noise and outliers",
63
+ "May not capture the semantic meaning of words",
64
+ "RedPajama-v1",
65
+ ],
66
+ "DM Math": [
67
+ "May exclude documents in less common languages",
68
+ "May remove documents with valuable information",
69
+ "May introduce bias in the analysis",
70
+ "May not accurately represent the language distribution",
71
+ "May not capture the complexity of document structure",
72
+ "May be sensitive to noise and outliers",
73
+ "May not capture the semantic meaning of words",
74
+ "RedPajama-v1",
75
+ ],
76
+ "USPTO": [
77
+ "May exclude documents in less common languages",
78
+ "May remove documents with valuable information",
79
+ "May introduce bias in the analysis",
80
+ "May not accurately represent the language distribution",
81
+ "May not capture the complexity of document structure",
82
+ "May be sensitive to noise and outliers",
83
+ "May not capture the semantic meaning of words",
84
+ "RedPajama-v1",
85
+ ],
86
+ "PG-19": [
87
+ "May exclude documents in less common languages",
88
+ "May remove documents with valuable information",
89
+ "May introduce bias in the analysis",
90
+ "May not accurately represent the language distribution",
91
+ "May not capture the complexity of document structure",
92
+ "May be sensitive to noise and outliers",
93
+ "May not capture the semantic meaning of words",
94
+ "RedPajama-v1",
95
+ ],
96
+ "HackerNews": [
97
+ "May exclude documents in less common languages",
98
+ "May remove documents with valuable information",
99
+ "May introduce bias in the analysis",
100
+ "May not accurately represent the language distribution",
101
+ "May not capture the complexity of document structure",
102
+ "May be sensitive to noise and outliers",
103
+ "May not capture the semantic meaning of words",
104
+ "RedPajama-v1",
105
+ ],
106
+ "Ubuntu IRC": [
107
+ "May exclude documents in less common languages",
108
+ "May remove documents with valuable information",
109
+ "May introduce bias in the analysis",
110
+ "May not accurately represent the language distribution",
111
+ "May not capture the complexity of document structure",
112
+ "May be sensitive to noise and outliers",
113
+ "May not capture the semantic meaning of words",
114
+ "RedPajama-v1",
115
+ ],
116
+ "EuroParl": [
117
+ "May exclude documents in less common languages",
118
+ "May remove documents with valuable information",
119
+ "May introduce bias in the analysis",
120
+ "May not accurately represent the language distribution",
121
+ "May not capture the complexity of document structure",
122
+ "May be sensitive to noise and outliers",
123
+ "May not capture the semantic meaning of words",
124
+ "RedPajama-v1",
125
+ ],
126
+ "StackExchange": [
127
+ "May exclude documents in less common languages",
128
+ "May remove documents with valuable information",
129
+ "May introduce bias in the analysis",
130
+ "May not accurately represent the language distribution",
131
+ "May not capture the complexity of document structure",
132
+ "May be sensitive to noise and outliers",
133
+ "May not capture the semantic meaning of words",
134
+ "RedPajama-v1",
135
+ ],
136
+ "Code": [
137
+ "May exclude documents in less common languages",
138
+ "May remove documents with valuable information",
139
+ "May introduce bias in the analysis",
140
+ "May not accurately represent the language distribution",
141
+ "May not capture the complexity of document structure",
142
+ "May be sensitive to noise and outliers",
143
+ "May not capture the semantic meaning of words",
144
+ "RedPajama-v1",
145
+ ],
146
+ }
147
+ )
148
+
149
+ table_html = dataset_comparison.to_html(index=False, border=0)
150
+ table_div = Div(NotStr(table_html), style="margin: 40px;")
151
 
152
  def overview():
153
+ return Div(Section(
154
+ H2("Combining the Best of Web and Curated Sources"),
155
+ H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
156
+ P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
157
+ table_div,
158
+ id="section5",
159
+ ),
160
+ id="inner-text",
161
+ )
162