smilkov committed on
Commit
285cf8d
1 Parent(s): 7c003db

Upload folder using huggingface_hub

Browse files
Files changed (32) hide show
  1. .gitattributes +17 -0
  2. data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin +3 -0
  3. data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl +3 -0
  4. data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json +38 -0
  5. data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl +3 -0
  6. data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet +3 -0
  7. data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json +34 -0
  8. data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet +3 -0
  9. data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json +39 -0
  10. data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet +3 -0
  11. data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json +48 -0
  12. data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet +3 -0
  13. data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json +41 -0
  14. data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet +3 -0
  15. data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json +62 -0
  16. data/datasets/lilac/mmlu_professional_law/config.yml +63 -0
  17. data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet +3 -0
  18. data/datasets/lilac/mmlu_professional_law/manifest.json +26 -0
  19. data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin +3 -0
  20. data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl +3 -0
  21. data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json +35 -0
  22. data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl +3 -0
  23. data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet +3 -0
  24. data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json +31 -0
  25. data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet +3 -0
  26. data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json +36 -0
  27. data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet +3 -0
  28. data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json +45 -0
  29. data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet +3 -0
  30. data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json +38 -0
  31. data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet +3 -0
  32. data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json +59 -0
.gitattributes CHANGED
@@ -13,3 +13,20 @@ data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs mer
13
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
14
  data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
15
  data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
14
  data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
15
  data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
16
+ data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
17
+ data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
18
+ data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
19
+ data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
20
+ data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
21
+ data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
22
+ data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
23
+ data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
24
+ data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
25
+ data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
26
+ data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
27
+ data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
28
+ data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
29
+ data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
30
+ data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
31
+ data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
32
+ data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df9d6e2f5df4b8693544f31ca78a9d1936a4caf47acc2babeb1cb766131b7636
3
+ size 684360968
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2081ce5d760026fe341e0553cd9e40747ca902e4e7edb851cb747f350f19bb0d
3
+ size 11174465
data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(choices)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "choices": {
10
+ "repeated_field": {
11
+ "fields": {
12
+ "gte-small": {
13
+ "repeated_field": {
14
+ "fields": {
15
+ "embedding": {
16
+ "dtype": "embedding"
17
+ }
18
+ },
19
+ "dtype": "string_span"
20
+ },
21
+ "signal": {
22
+ "signal_name": "gte-small"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "signal_name": "gte-small"
32
+ },
33
+ "enriched_path": [
34
+ "choices",
35
+ "*"
36
+ ],
37
+ "vector_store": "hnsw"
38
+ }
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02fb1662da21f33ea1429a0f9adf1301185da46f642a722717fe7c523314fa57
3
+ size 11173475
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:968d4f87c7b51b995d9e3a96423a06b91984e5ee4a47062cd53fe87cca5cafbe
3
+ size 3469413
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(choices)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "choices": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "lang_detection": {
15
+ "dtype": "string",
16
+ "signal": {
17
+ "split_by_paragraph": false,
18
+ "signal_name": "lang_detection"
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ },
26
+ "signal": {
27
+ "split_by_paragraph": false,
28
+ "signal_name": "lang_detection"
29
+ },
30
+ "enriched_path": [
31
+ "choices",
32
+ "*"
33
+ ]
34
+ }
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:316f3be499fcbb960bc1e83a201838ca0b3047a71d8e1c302fe4e0d833a3bf90
3
+ size 5544176
data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(choices)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "choices": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "near_dup": {
15
+ "fields": {
16
+ "cluster_id": {
17
+ "dtype": "uint32",
18
+ "categorical": true
19
+ }
20
+ },
21
+ "signal": {
22
+ "threshold": 0.85,
23
+ "signal_name": "near_dup"
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "threshold": 0.85,
33
+ "signal_name": "near_dup"
34
+ },
35
+ "enriched_path": [
36
+ "choices",
37
+ "*"
38
+ ]
39
+ }
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cb41d4e9d0d82bd824abfa733d5be3a599e011098c5d41ebadeb1166a15f722
3
+ size 3393096
data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(choices)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "choices": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "pii": {
15
+ "fields": {
16
+ "emails": {
17
+ "repeated_field": {
18
+ "dtype": "string_span"
19
+ }
20
+ },
21
+ "ip_addresses": {
22
+ "repeated_field": {
23
+ "dtype": "string_span"
24
+ }
25
+ },
26
+ "secrets": {
27
+ "repeated_field": {
28
+ "dtype": "string_span"
29
+ }
30
+ }
31
+ },
32
+ "signal": {
33
+ "signal_name": "pii"
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ },
41
+ "signal": {
42
+ "signal_name": "pii"
43
+ },
44
+ "enriched_path": [
45
+ "choices",
46
+ "*"
47
+ ]
48
+ }
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b1255490f17c64f88b8b332c7df30060df612b9de11b17aaf6f70234c363e1e
3
+ size 4080744
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(choices)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "choices": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "spacy_ner": {
15
+ "repeated_field": {
16
+ "fields": {
17
+ "label": {
18
+ "dtype": "string"
19
+ }
20
+ },
21
+ "dtype": "string_span"
22
+ },
23
+ "signal": {
24
+ "model": "en_core_web_sm",
25
+ "signal_name": "spacy_ner"
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ }
32
+ },
33
+ "signal": {
34
+ "model": "en_core_web_sm",
35
+ "signal_name": "spacy_ner"
36
+ },
37
+ "enriched_path": [
38
+ "choices",
39
+ "*"
40
+ ]
41
+ }
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc00a68e0f835e25b214d90e7e48251b39d748f1e836af713440cd0ea2517ead
3
+ size 4634821
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(choices)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "choices": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "text_statistics": {
15
+ "fields": {
16
+ "num_characters": {
17
+ "dtype": "int32"
18
+ },
19
+ "readability": {
20
+ "dtype": "float32"
21
+ },
22
+ "log(type_token_ratio)": {
23
+ "dtype": "float32"
24
+ },
25
+ "frac_non_ascii": {
26
+ "dtype": "float32",
27
+ "bins": [
28
+ [
29
+ "Low",
30
+ null,
31
+ 0.15
32
+ ],
33
+ [
34
+ "Medium",
35
+ 0.15,
36
+ 0.3
37
+ ],
38
+ [
39
+ "High",
40
+ 0.3,
41
+ null
42
+ ]
43
+ ]
44
+ }
45
+ },
46
+ "signal": {
47
+ "signal_name": "text_statistics"
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ }
54
+ },
55
+ "signal": {
56
+ "signal_name": "text_statistics"
57
+ },
58
+ "enriched_path": [
59
+ "choices",
60
+ "*"
61
+ ]
62
+ }
data/datasets/lilac/mmlu_professional_law/config.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path:
4
+ - choices
5
+ - '*'
6
+ - embedding: gte-small
7
+ path: question
8
+ name: mmlu_professional_law
9
+ namespace: local
10
+ settings:
11
+ preferred_embedding: gte-small
12
+ ui:
13
+ media_paths:
14
+ - question
15
+ - - choices
16
+ - '*'
17
+ signals:
18
+ - path: question
19
+ signal:
20
+ signal_name: text_statistics
21
+ - path: question
22
+ signal:
23
+ signal_name: pii
24
+ - path: question
25
+ signal:
26
+ signal_name: near_dup
27
+ - path:
28
+ - choices
29
+ - '*'
30
+ signal:
31
+ signal_name: text_statistics
32
+ - path:
33
+ - choices
34
+ - '*'
35
+ signal:
36
+ signal_name: spacy_ner
37
+ - path: question
38
+ signal:
39
+ signal_name: lang_detection
40
+ - path:
41
+ - choices
42
+ - '*'
43
+ signal:
44
+ signal_name: near_dup
45
+ - path:
46
+ - choices
47
+ - '*'
48
+ signal:
49
+ signal_name: pii
50
+ - path:
51
+ - choices
52
+ - '*'
53
+ signal:
54
+ signal_name: lang_detection
55
+ - path: question
56
+ signal:
57
+ signal_name: spacy_ner
58
+ source:
59
+ config_name: professional_law
60
+ dataset_name: cais/mmlu
61
+ source_name: huggingface
62
+ tags:
63
+ - legal
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65cd2771cf0bb88dbed9ad66ceaff472115f07c9dfea866c7e3f65b68392e745
3
+ size 50699938
data/datasets/lilac/mmlu_professional_law/manifest.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "question": {
8
+ "dtype": "string"
9
+ },
10
+ "choices": {
11
+ "repeated_field": {
12
+ "dtype": "string"
13
+ }
14
+ },
15
+ "answer": {
16
+ "dtype": "string"
17
+ },
18
+ "__hfsplit__": {
19
+ "dtype": "string"
20
+ },
21
+ "__rowid__": {
22
+ "dtype": "string"
23
+ }
24
+ }
25
+ }
26
+ }
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b02300405fccc3011294e15ee869933dd81578173435defbcb19e3b40a65e93
3
+ size 771802212
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f72169740d80ee2b2ea66589d7ebcc58c83381978a4640a27510c416a02bf6c7
3
+ size 11296648
data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(question)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "question": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "question"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b51cad455e94b167bc9cf130c262ed1b143a8f386c7074a61983e01cd93d277
3
+ size 7911602
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf6cf8cdc246ce4406599aec8782d3be02f2585f1fbad74173faf0ffcb453a49
3
+ size 3361922
data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "question"
30
+ ]
31
+ }
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4139f699d1a248cf5378c442ef6f17970913394d5d0c79bd7c6e6801ab548a
3
+ size 3697516
data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "question"
35
+ ]
36
+ }
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2735c4a2c5d40973652d369140533af74425db6dd753f8a25850d4efeee4928e
3
+ size 3369080
data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "question"
44
+ ]
45
+ }
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e775b663f9a3b7c7ebdd31f9a860254dec31c18aa46c5a61820050d0556cbb0f
3
+ size 9105982
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "spacy_ner": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "label": {
17
+ "dtype": "string"
18
+ }
19
+ },
20
+ "dtype": "string_span"
21
+ },
22
+ "signal": {
23
+ "model": "en_core_web_sm",
24
+ "signal_name": "spacy_ner"
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "model": "en_core_web_sm",
33
+ "signal_name": "spacy_ner"
34
+ },
35
+ "enriched_path": [
36
+ "question"
37
+ ]
38
+ }
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:995b3ac42907ea244d9cb04c68a4715af8ddb7d72dcced056bc58dc9a9f05e7e
3
+ size 4389031
data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "question"
58
+ ]
59
+ }