smilkov commited on
Commit
0baab57
1 Parent(s): 2101081

Upload folder using huggingface_hub

Browse files
Files changed (38) hide show
  1. .gitattributes +19 -0
  2. data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet +3 -0
  3. data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json +39 -0
  4. data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet +3 -0
  5. data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json +44 -0
  6. data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet +3 -0
  7. data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json +53 -0
  8. data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet +3 -0
  9. data/datasets/lilac/squad_v2/answers/text/spacy_ner/signal_manifest.json +46 -0
  10. data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet +3 -0
  11. data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json +67 -0
  12. data/datasets/lilac/squad_v2/config.yml +80 -0
  13. data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin +3 -0
  14. data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl +3 -0
  15. data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json +35 -0
  16. data/datasets/lilac/squad_v2/context/gte-small/spans.pkl +3 -0
  17. data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet +3 -0
  18. data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json +31 -0
  19. data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet +3 -0
  20. data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json +36 -0
  21. data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet +3 -0
  22. data/datasets/lilac/squad_v2/context/pii/signal_manifest.json +45 -0
  23. data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet +3 -0
  24. data/datasets/lilac/squad_v2/context/spacy_ner/signal_manifest.json +38 -0
  25. data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet +3 -0
  26. data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json +59 -0
  27. data/datasets/lilac/squad_v2/data-00000-of-00001.parquet +3 -0
  28. data/datasets/lilac/squad_v2/manifest.json +41 -0
  29. data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet +3 -0
  30. data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json +31 -0
  31. data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet +3 -0
  32. data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json +36 -0
  33. data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet +3 -0
  34. data/datasets/lilac/squad_v2/question/pii/signal_manifest.json +45 -0
  35. data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet +3 -0
  36. data/datasets/lilac/squad_v2/question/spacy_ner/signal_manifest.json +38 -0
  37. data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet +3 -0
  38. data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json +59 -0
.gitattributes CHANGED
@@ -48,3 +48,22 @@ data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00
48
  data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
49
  data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
50
  data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
49
  data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
50
  data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
51
+ data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
52
+ data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
53
+ data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
54
+ data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
55
+ data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
56
+ data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
57
+ data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
58
+ data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
59
+ data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
60
+ data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
61
+ data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
62
+ data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
63
+ data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
64
+ data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
65
+ data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
66
+ data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
67
+ data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
68
+ data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
69
+ data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c03103ba132a7209461f86bd1045431d06db431930344e4bdf97236347cc2164
3
+ size 4738120
data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(answers.text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "answers": {
12
+ "fields": {
13
+ "text": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "lang_detection": {
17
+ "dtype": "string",
18
+ "signal": {
19
+ "split_by_paragraph": false,
20
+ "signal_name": "lang_detection"
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "split_by_paragraph": false,
32
+ "signal_name": "lang_detection"
33
+ },
34
+ "enriched_path": [
35
+ "answers",
36
+ "text",
37
+ "*"
38
+ ]
39
+ }
data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf4ae7259d126104da2aea0e1fad0c7cd83033f7774f0d44a2436f7c891fde34
3
+ size 5224344
data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(answers.text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "answers": {
12
+ "fields": {
13
+ "text": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "near_dup": {
17
+ "fields": {
18
+ "cluster_id": {
19
+ "dtype": "uint32",
20
+ "categorical": true
21
+ }
22
+ },
23
+ "signal": {
24
+ "threshold": 0.85,
25
+ "signal_name": "near_dup"
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ }
32
+ }
33
+ }
34
+ },
35
+ "signal": {
36
+ "threshold": 0.85,
37
+ "signal_name": "near_dup"
38
+ },
39
+ "enriched_path": [
40
+ "answers",
41
+ "text",
42
+ "*"
43
+ ]
44
+ }
data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adf2c35877ae9957a987049c40a9a1b2edbe4b2d93b1da86bfeb739fae240040
3
+ size 4841393
data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(answers.text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "answers": {
12
+ "fields": {
13
+ "text": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "pii": {
17
+ "fields": {
18
+ "emails": {
19
+ "repeated_field": {
20
+ "dtype": "string_span"
21
+ }
22
+ },
23
+ "ip_addresses": {
24
+ "repeated_field": {
25
+ "dtype": "string_span"
26
+ }
27
+ },
28
+ "secrets": {
29
+ "repeated_field": {
30
+ "dtype": "string_span"
31
+ }
32
+ }
33
+ },
34
+ "signal": {
35
+ "signal_name": "pii"
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ }
42
+ }
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "pii"
47
+ },
48
+ "enriched_path": [
49
+ "answers",
50
+ "text",
51
+ "*"
52
+ ]
53
+ }
data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86cfdb80cf22a545cd01d557ae3396e942fa9679990c43457d110a6dac6a2d78
3
+ size 5041580
data/datasets/lilac/squad_v2/answers/text/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(answers.text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "answers": {
12
+ "fields": {
13
+ "text": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "spacy_ner": {
17
+ "repeated_field": {
18
+ "fields": {
19
+ "label": {
20
+ "dtype": "string"
21
+ }
22
+ },
23
+ "dtype": "string_span"
24
+ },
25
+ "signal": {
26
+ "model": "en_core_web_sm",
27
+ "signal_name": "spacy_ner"
28
+ }
29
+ }
30
+ }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ }
36
+ },
37
+ "signal": {
38
+ "model": "en_core_web_sm",
39
+ "signal_name": "spacy_ner"
40
+ },
41
+ "enriched_path": [
42
+ "answers",
43
+ "text",
44
+ "*"
45
+ ]
46
+ }
data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b5ddbc51455341a26121c5427bd0f32639515dad34d77561402df81d8ab903
3
+ size 5100206
data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(answers.text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "answers": {
12
+ "fields": {
13
+ "text": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "text_statistics": {
17
+ "fields": {
18
+ "num_characters": {
19
+ "dtype": "int32"
20
+ },
21
+ "readability": {
22
+ "dtype": "float32"
23
+ },
24
+ "log(type_token_ratio)": {
25
+ "dtype": "float32"
26
+ },
27
+ "frac_non_ascii": {
28
+ "dtype": "float32",
29
+ "bins": [
30
+ [
31
+ "Low",
32
+ null,
33
+ 0.15
34
+ ],
35
+ [
36
+ "Medium",
37
+ 0.15,
38
+ 0.3
39
+ ],
40
+ [
41
+ "High",
42
+ 0.3,
43
+ null
44
+ ]
45
+ ]
46
+ }
47
+ },
48
+ "signal": {
49
+ "signal_name": "text_statistics"
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
55
+ }
56
+ }
57
+ }
58
+ },
59
+ "signal": {
60
+ "signal_name": "text_statistics"
61
+ },
62
+ "enriched_path": [
63
+ "answers",
64
+ "text",
65
+ "*"
66
+ ]
67
+ }
data/datasets/lilac/squad_v2/config.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path: context
4
+ name: squad_v2
5
+ namespace: local
6
+ settings:
7
+ preferred_embedding: gte-small
8
+ ui:
9
+ media_paths:
10
+ - context
11
+ - question
12
+ - - answers
13
+ - text
14
+ - '*'
15
+ signals:
16
+ - path: context
17
+ signal:
18
+ signal_name: text_statistics
19
+ - path: context
20
+ signal:
21
+ signal_name: pii
22
+ - path: context
23
+ signal:
24
+ signal_name: near_dup
25
+ - path: question
26
+ signal:
27
+ signal_name: spacy_ner
28
+ - path: question
29
+ signal:
30
+ signal_name: pii
31
+ - path:
32
+ - answers
33
+ - text
34
+ - '*'
35
+ signal:
36
+ signal_name: pii
37
+ - path:
38
+ - answers
39
+ - text
40
+ - '*'
41
+ signal:
42
+ signal_name: spacy_ner
43
+ - path:
44
+ - answers
45
+ - text
46
+ - '*'
47
+ signal:
48
+ signal_name: near_dup
49
+ - path: context
50
+ signal:
51
+ signal_name: lang_detection
52
+ - path:
53
+ - answers
54
+ - text
55
+ - '*'
56
+ signal:
57
+ signal_name: lang_detection
58
+ - path: question
59
+ signal:
60
+ signal_name: near_dup
61
+ - path: question
62
+ signal:
63
+ signal_name: lang_detection
64
+ - path:
65
+ - answers
66
+ - text
67
+ - '*'
68
+ signal:
69
+ signal_name: text_statistics
70
+ - path: question
71
+ signal:
72
+ signal_name: text_statistics
73
+ - path: context
74
+ signal:
75
+ signal_name: spacy_ner
76
+ source:
77
+ dataset_name: squad_v2
78
+ source_name: huggingface
79
+ tags:
80
+ - machine-learning
data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e24a3d0200e46bb221dacc8066ccc85033ff0378721338cfd60612f130e034d1
3
+ size 601394376
data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc41116b96f4e1fa547697ce62afe0fe7aba054a8d694b308e1e0270474801da
3
+ size 10694495
data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(context)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "context": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "context"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/squad_v2/context/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d69a524ee48c0c218eeb901ae265ae74b12511fee17fe31ae1627c0122e25f04
3
+ size 8815907
data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd10704958387059935d9e22f0415677a6daf3105105af8314314ce3c3114274
3
+ size 4682949
data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(context)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "context": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "context"
30
+ ]
31
+ }
data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0d3a359a05aa7c073900e4973569f808afa26c7bf0328c31e553efcc14bea90
3
+ size 4962702
data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(context)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "context": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "context"
35
+ ]
36
+ }
data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0265330a9d7ff27498f4f0e9ddce89a027203d11941d6bc8f8d4334872346d9c
3
+ size 4685328
data/datasets/lilac/squad_v2/context/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(context)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "context": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "context"
44
+ ]
45
+ }
data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffe9e61a5449146e9b7cf725b80f46277b359a1aca1d04fe15c5bae4e9f286f1
3
+ size 9241062
data/datasets/lilac/squad_v2/context/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(context)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "context": {
12
+ "fields": {
13
+ "spacy_ner": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "label": {
17
+ "dtype": "string"
18
+ }
19
+ },
20
+ "dtype": "string_span"
21
+ },
22
+ "signal": {
23
+ "model": "en_core_web_sm",
24
+ "signal_name": "spacy_ner"
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "model": "en_core_web_sm",
33
+ "signal_name": "spacy_ner"
34
+ },
35
+ "enriched_path": [
36
+ "context"
37
+ ]
38
+ }
data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1915970f9199dc82019cbb6089c85df3ddfd189848e0f34f549e34b617cd0f8
3
+ size 5165481
data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(context)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "context": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "context"
58
+ ]
59
+ }
data/datasets/lilac/squad_v2/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9f54db85b8bacd3ea30ecf70410441e300c783e621767c1d9746d6474852ceb
3
+ size 27086838
data/datasets/lilac/squad_v2/manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "id": {
8
+ "dtype": "string"
9
+ },
10
+ "title": {
11
+ "dtype": "string"
12
+ },
13
+ "context": {
14
+ "dtype": "string"
15
+ },
16
+ "question": {
17
+ "dtype": "string"
18
+ },
19
+ "answers": {
20
+ "fields": {
21
+ "text": {
22
+ "repeated_field": {
23
+ "dtype": "string"
24
+ }
25
+ },
26
+ "answer_start": {
27
+ "repeated_field": {
28
+ "dtype": "int32"
29
+ }
30
+ }
31
+ }
32
+ },
33
+ "__hfsplit__": {
34
+ "dtype": "string"
35
+ },
36
+ "__rowid__": {
37
+ "dtype": "string"
38
+ }
39
+ }
40
+ }
41
+ }
data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d32aa3096865a6236fd620a16499876c919b245e26fe9a2809b3c02eebc13d
3
+ size 4694280
data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "question"
30
+ ]
31
+ }
data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b7e1d521750d16c37c70c378306ef22916e2d9715a565f1127d9e3626c966d4
3
+ size 5571030
data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "question"
35
+ ]
36
+ }
data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a72f4f32331de183cfe67be224ba473ec83ba8f855dafab97371580684718e4f
3
+ size 4685523
data/datasets/lilac/squad_v2/question/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "question"
44
+ ]
45
+ }
data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca09ac95100482b232d2b3c7e9a08490ecb056d10b9e91a23170df6669482a3b
3
+ size 5286963
data/datasets/lilac/squad_v2/question/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "spacy_ner": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "label": {
17
+ "dtype": "string"
18
+ }
19
+ },
20
+ "dtype": "string_span"
21
+ },
22
+ "signal": {
23
+ "model": "en_core_web_sm",
24
+ "signal_name": "spacy_ner"
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "model": "en_core_web_sm",
33
+ "signal_name": "spacy_ner"
34
+ },
35
+ "enriched_path": [
36
+ "question"
37
+ ]
38
+ }
data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36304efa2147c8737c1c4da192837aa855504ecfe9eb6f14d267c501bcaaa246
3
+ size 5104750
data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(question)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "question": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "question"
58
+ ]
59
+ }