victormiller committed on
Commit
48b277d
·
verified ·
1 Parent(s): 5e5aef1

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +52 -52
curated.py CHANGED
@@ -89,19 +89,19 @@ table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
89
  freelaw_filter = pd.DataFrame(
90
  {
91
  "Dataset": [
92
- "Wikipedia",
93
  ],
94
  "Lines Downloaded": [
95
- "61614907",
96
  ],
97
  "Percent Removed After Language Filter": [
98
- "0.00%",
99
  ],
100
  "Percent Removed After Min Word Count Filter": [
101
- "1.86%",
102
  ],
103
  "Percent Removed After Unigram Probability Filter": [
104
- "0.00%",
105
  ],
106
  "Percent Removed After Local Dedup": [
107
  "",
@@ -118,16 +118,16 @@ table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin: 40px;")
118
  dmm_filter = pd.DataFrame(
119
  {
120
  "Dataset": [
121
- "Wikipedia",
122
  ],
123
  "Lines Downloaded": [
124
- "61614907",
125
  ],
126
  "Percent Removed After Language Filter": [
127
  "0.00%",
128
  ],
129
  "Percent Removed After Min Word Count Filter": [
130
- "1.86%",
131
  ],
132
  "Percent Removed After Unigram Probability Filter": [
133
  "0.00%",
@@ -148,19 +148,19 @@ table_div_dmm = Div(NotStr(table_html_dmm), style="margin: 40px;")
148
  uspto_filter = pd.DataFrame(
149
  {
150
  "Dataset": [
151
- "Wikipedia",
152
  ],
153
  "Lines Downloaded": [
154
- "61614907",
155
  ],
156
  "Percent Removed After Language Filter": [
157
- "0.00%",
158
  ],
159
  "Percent Removed After Min Word Count Filter": [
160
- "1.86%",
161
  ],
162
  "Percent Removed After Unigram Probability Filter": [
163
- "0.00%",
164
  ],
165
  "Percent Removed After Local Dedup": [
166
  "",
@@ -177,19 +177,19 @@ table_div_uspto = Div(NotStr(table_html_uspto), style="margin: 40px;")
177
  pg19_filter = pd.DataFrame(
178
  {
179
  "Dataset": [
180
- "Wikipedia",
181
  ],
182
  "Lines Downloaded": [
183
- "61614907",
184
  ],
185
  "Percent Removed After Language Filter": [
186
- "0.00%",
187
  ],
188
  "Percent Removed After Min Word Count Filter": [
189
- "1.86%",
190
  ],
191
  "Percent Removed After Unigram Probability Filter": [
192
- "0.00%",
193
  ],
194
  "Percent Removed After Local Dedup": [
195
  "",
@@ -207,19 +207,19 @@ table_div_pg19 = Div(NotStr(table_html_pg19), style="margin: 40px;")
207
  hn_filter = pd.DataFrame(
208
  {
209
  "Dataset": [
210
- "Wikipedia",
211
  ],
212
  "Lines Downloaded": [
213
- "61614907",
214
  ],
215
  "Percent Removed After Language Filter": [
216
- "0.00%",
217
  ],
218
  "Percent Removed After Min Word Count Filter": [
219
- "1.86%",
220
  ],
221
  "Percent Removed After Unigram Probability Filter": [
222
- "0.00%",
223
  ],
224
  "Percent Removed After Local Dedup": [
225
  "",
@@ -237,19 +237,19 @@ table_div_hn = Div(NotStr(table_html_hn), style="margin: 40px;")
237
  uirc_filter = pd.DataFrame(
238
  {
239
  "Dataset": [
240
- "Wikipedia",
241
  ],
242
  "Lines Downloaded": [
243
- "61614907",
244
  ],
245
  "Percent Removed After Language Filter": [
246
- "0.00%",
247
  ],
248
  "Percent Removed After Min Word Count Filter": [
249
- "1.86%",
250
  ],
251
  "Percent Removed After Unigram Probability Filter": [
252
- "0.00%",
253
  ],
254
  "Percent Removed After Local Dedup": [
255
  "",
@@ -266,16 +266,16 @@ table_div_uirc = Div(NotStr(table_html_uirc), style="margin: 40px;")
266
  up_filter = pd.DataFrame(
267
  {
268
  "Dataset": [
269
- "Wikipedia",
270
  ],
271
  "Lines Downloaded": [
272
- "61614907",
273
  ],
274
  "Percent Removed After Language Filter": [
275
  "0.00%",
276
  ],
277
  "Percent Removed After Min Word Count Filter": [
278
- "1.86%",
279
  ],
280
  "Percent Removed After Unigram Probability Filter": [
281
  "0.00%",
@@ -295,16 +295,16 @@ table_div_up = Div(NotStr(table_html_up), style="margin: 40px;")
295
  se_filter = pd.DataFrame(
296
  {
297
  "Dataset": [
298
- "Wikipedia",
299
  ],
300
  "Lines Downloaded": [
301
- "61614907",
302
  ],
303
  "Percent Removed After Language Filter": [
304
  "0.00%",
305
  ],
306
  "Percent Removed After Min Word Count Filter": [
307
- "1.86%",
308
  ],
309
  "Percent Removed After Unigram Probability Filter": [
310
  "0.00%",
@@ -324,19 +324,19 @@ table_div_se = Div(NotStr(table_html_se), style="margin: 40px;")
324
  arx_filter = pd.DataFrame(
325
  {
326
  "Dataset": [
327
- "Wikipedia",
328
  ],
329
  "Lines Downloaded": [
330
- "61614907",
331
  ],
332
  "Percent Removed After Language Filter": [
333
- "0.00%",
334
  ],
335
  "Percent Removed After Min Word Count Filter": [
336
- "1.86%",
337
  ],
338
  "Percent Removed After Unigram Probability Filter": [
339
- "0.00%",
340
  ],
341
  "Percent Removed After Local Dedup": [
342
  "",
@@ -353,16 +353,16 @@ table_div_arx = Div(NotStr(table_html_arx), style="margin: 40px;")
353
  s2o_filter = pd.DataFrame(
354
  {
355
  "Dataset": [
356
- "Wikipedia",
357
  ],
358
  "Lines Downloaded": [
359
- "61614907",
360
  ],
361
  "Percent Removed After Language Filter": [
362
  "0.00%",
363
  ],
364
  "Percent Removed After Min Word Count Filter": [
365
- "1.86%",
366
  ],
367
  "Percent Removed After Unigram Probability Filter": [
368
  "0.00%",
@@ -382,19 +382,19 @@ table_div_s2o = Div(NotStr(table_html_s2o), style="margin: 40px;")
382
  med_filter = pd.DataFrame(
383
  {
384
  "Dataset": [
385
- "Wikipedia",
386
  ],
387
  "Lines Downloaded": [
388
- "61614907",
389
  ],
390
  "Percent Removed After Language Filter": [
391
- "0.00%",
392
  ],
393
  "Percent Removed After Min Word Count Filter": [
394
- "1.86%",
395
  ],
396
  "Percent Removed After Unigram Probability Filter": [
397
- "0.00%",
398
  ],
399
  "Percent Removed After Local Dedup": [
400
  "",
@@ -411,19 +411,19 @@ table_div_med = Div(NotStr(table_html_med), style="margin: 40px;")
411
  phil_filter = pd.DataFrame(
412
  {
413
  "Dataset": [
414
- "Wikipedia",
415
  ],
416
  "Lines Downloaded": [
417
- "61614907",
418
  ],
419
  "Percent Removed After Language Filter": [
420
- "0.00%",
421
  ],
422
  "Percent Removed After Min Word Count Filter": [
423
- "1.86%",
424
  ],
425
  "Percent Removed After Unigram Probability Filter": [
426
- "0.00%",
427
  ],
428
  "Percent Removed After Local Dedup": [
429
  "",
 
89
  freelaw_filter = pd.DataFrame(
90
  {
91
  "Dataset": [
92
+ "FreeLaw",
93
  ],
94
  "Lines Downloaded": [
95
+ "75971288",
96
  ],
97
  "Percent Removed After Language Filter": [
98
+ "3.00%",
99
  ],
100
  "Percent Removed After Min Word Count Filter": [
101
+ "7.49%",
102
  ],
103
  "Percent Removed After Unigram Probability Filter": [
104
+ "0.07%",
105
  ],
106
  "Percent Removed After Local Dedup": [
107
  "",
 
118
  dmm_filter = pd.DataFrame(
119
  {
120
  "Dataset": [
121
+ "DM Math",
122
  ],
123
  "Lines Downloaded": [
124
+ "112559888",
125
  ],
126
  "Percent Removed After Language Filter": [
127
  "0.00%",
128
  ],
129
  "Percent Removed After Min Word Count Filter": [
130
+ "0.00%",
131
  ],
132
  "Percent Removed After Unigram Probability Filter": [
133
  "0.00%",
 
148
  uspto_filter = pd.DataFrame(
149
  {
150
  "Dataset": [
151
+ "USPTO",
152
  ],
153
  "Lines Downloaded": [
154
+ "6880276",
155
  ],
156
  "Percent Removed After Language Filter": [
157
+ "0.02%",
158
  ],
159
  "Percent Removed After Min Word Count Filter": [
160
+ "1.88%",
161
  ],
162
  "Percent Removed After Unigram Probability Filter": [
163
+ "0.01%",
164
  ],
165
  "Percent Removed After Local Dedup": [
166
  "",
 
177
  pg19_filter = pd.DataFrame(
178
  {
179
  "Dataset": [
180
+ "PG-19",
181
  ],
182
  "Lines Downloaded": [
183
+ "28752",
184
  ],
185
  "Percent Removed After Language Filter": [
186
+ "0.24%",
187
  ],
188
  "Percent Removed After Min Word Count Filter": [
189
+ "0.00%",
190
  ],
191
  "Percent Removed After Unigram Probability Filter": [
192
+ "0.17%",
193
  ],
194
  "Percent Removed After Local Dedup": [
195
  "",
 
207
  hn_filter = pd.DataFrame(
208
  {
209
  "Dataset": [
210
+ "HackerNews",
211
  ],
212
  "Lines Downloaded": [
213
+ "2064931",
214
  ],
215
  "Percent Removed After Language Filter": [
216
+ "2.62%",
217
  ],
218
  "Percent Removed After Min Word Count Filter": [
219
+ "0.02%",
220
  ],
221
  "Percent Removed After Unigram Probability Filter": [
222
+ "0.34%",
223
  ],
224
  "Percent Removed After Local Dedup": [
225
  "",
 
237
  uirc_filter = pd.DataFrame(
238
  {
239
  "Dataset": [
240
+ "Ubuntu IRC",
241
  ],
242
  "Lines Downloaded": [
243
+ "37966",
244
  ],
245
  "Percent Removed After Language Filter": [
246
+ "38.10%",
247
  ],
248
  "Percent Removed After Min Word Count Filter": [
249
+ "0.14%",
250
  ],
251
  "Percent Removed After Unigram Probability Filter": [
252
+ "1.12%",
253
  ],
254
  "Percent Removed After Local Dedup": [
255
  "",
 
266
  up_filter = pd.DataFrame(
267
  {
268
  "Dataset": [
269
+ "EuroParl",
270
  ],
271
  "Lines Downloaded": [
272
+ "69814",
273
  ],
274
  "Percent Removed After Language Filter": [
275
  "0.00%",
276
  ],
277
  "Percent Removed After Min Word Count Filter": [
278
+ "0.00%",
279
  ],
280
  "Percent Removed After Unigram Probability Filter": [
281
  "0.00%",
 
295
  se_filter = pd.DataFrame(
296
  {
297
  "Dataset": [
298
+ "StackExchange",
299
  ],
300
  "Lines Downloaded": [
301
+ "23246548",
302
  ],
303
  "Percent Removed After Language Filter": [
304
  "0.00%",
305
  ],
306
  "Percent Removed After Min Word Count Filter": [
307
+ "0.00%",
308
  ],
309
  "Percent Removed After Unigram Probability Filter": [
310
  "0.00%",
 
324
  arx_filter = pd.DataFrame(
325
  {
326
  "Dataset": [
327
+ "ArXiv",
328
  ],
329
  "Lines Downloaded": [
330
+ "1911867",
331
  ],
332
  "Percent Removed After Language Filter": [
333
+ "2.22%",
334
  ],
335
  "Percent Removed After Min Word Count Filter": [
336
+ "5.65%",
337
  ],
338
  "Percent Removed After Unigram Probability Filter": [
339
+ "0.07%",
340
  ],
341
  "Percent Removed After Local Dedup": [
342
  "",
 
353
  s2o_filter = pd.DataFrame(
354
  {
355
  "Dataset": [
356
+ "S2ORC",
357
  ],
358
  "Lines Downloaded": [
359
+ "12963563",
360
  ],
361
  "Percent Removed After Language Filter": [
362
  "0.00%",
363
  ],
364
  "Percent Removed After Min Word Count Filter": [
365
+ "0.00%",
366
  ],
367
  "Percent Removed After Unigram Probability Filter": [
368
  "0.00%",
 
382
  med_filter = pd.DataFrame(
383
  {
384
  "Dataset": [
385
+ "PubMed - Central",
386
  ],
387
  "Lines Downloaded": [
388
+ "5230932",
389
  ],
390
  "Percent Removed After Language Filter": [
391
+ "7.66%",
392
  ],
393
  "Percent Removed After Min Word Count Filter": [
394
+ "1.29%",
395
  ],
396
  "Percent Removed After Unigram Probability Filter": [
397
+ "0.02%",
398
  ],
399
  "Percent Removed After Local Dedup": [
400
  "",
 
411
  phil_filter = pd.DataFrame(
412
  {
413
  "Dataset": [
414
+ "Phil Papers",
415
  ],
416
  "Lines Downloaded": [
417
+ "49389",
418
  ],
419
  "Percent Removed After Language Filter": [
420
+ "20.68%",
421
  ],
422
  "Percent Removed After Min Word Count Filter": [
423
+ "0.00%",
424
  ],
425
  "Percent Removed After Unigram Probability Filter": [
426
+ "0.12%",
427
  ],
428
  "Percent Removed After Local Dedup": [
429
  "",