smilkov commited on
Commit
38086f8
1 Parent(s): 22080cf

Upload folder using huggingface_hub

Browse files
Files changed (18) hide show
  1. .gitattributes +9 -0
  2. data/datasets/lilac/open-asssistant-conversations/config.yml +31 -0
  3. data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet +3 -0
  4. data/datasets/lilac/open-asssistant-conversations/manifest.json +118 -0
  5. data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin +3 -0
  6. data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl +3 -0
  7. data/datasets/lilac/open-asssistant-conversations/text/gte-small/signal_manifest.json +35 -0
  8. data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl +3 -0
  9. data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet +3 -0
  10. data/datasets/lilac/open-asssistant-conversations/text/lang_detection/signal_manifest.json +31 -0
  11. data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet +3 -0
  12. data/datasets/lilac/open-asssistant-conversations/text/near_dup/signal_manifest.json +36 -0
  13. data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet +3 -0
  14. data/datasets/lilac/open-asssistant-conversations/text/pii/signal_manifest.json +45 -0
  15. data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet +3 -0
  16. data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/signal_manifest.json +38 -0
  17. data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet +3 -0
  18. data/datasets/lilac/open-asssistant-conversations/text/text_statistics/signal_manifest.json +59 -0
.gitattributes CHANGED
@@ -39,3 +39,12 @@ data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.
39
  data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
40
  data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
41
  data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
39
  data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
40
  data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
41
  data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
42
+ data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
43
+ data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
44
+ data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
45
+ data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
46
+ data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
47
+ data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
48
+ data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
49
+ data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
50
+ data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/open-asssistant-conversations/config.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path: text
4
+ name: open-asssistant-conversations
5
+ namespace: local
6
+ settings:
7
+ preferred_embedding: gte-small
8
+ ui:
9
+ media_paths:
10
+ - text
11
+ signals:
12
+ - path: text
13
+ signal:
14
+ signal_name: text_statistics
15
+ - path: text
16
+ signal:
17
+ signal_name: lang_detection
18
+ - path: text
19
+ signal:
20
+ signal_name: near_dup
21
+ - path: text
22
+ signal:
23
+ signal_name: spacy_ner
24
+ - path: text
25
+ signal:
26
+ signal_name: pii
27
+ source:
28
+ dataset_name: OpenAssistant/oasst1
29
+ source_name: huggingface
30
+ tags:
31
+ - machine-learning
data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d53dbedb539cf7fa3f89d739f698fd3ccf1fbbd86dac20bd0b74cf674cc508e8
3
+ size 42071566
data/datasets/lilac/open-asssistant-conversations/manifest.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "message_id": {
8
+ "dtype": "string"
9
+ },
10
+ "parent_id": {
11
+ "dtype": "string"
12
+ },
13
+ "user_id": {
14
+ "dtype": "string"
15
+ },
16
+ "created_date": {
17
+ "dtype": "string"
18
+ },
19
+ "text": {
20
+ "dtype": "string"
21
+ },
22
+ "role": {
23
+ "dtype": "string"
24
+ },
25
+ "lang": {
26
+ "dtype": "string"
27
+ },
28
+ "review_count": {
29
+ "dtype": "int32"
30
+ },
31
+ "review_result": {
32
+ "dtype": "boolean"
33
+ },
34
+ "deleted": {
35
+ "dtype": "boolean"
36
+ },
37
+ "rank": {
38
+ "dtype": "int32"
39
+ },
40
+ "synthetic": {
41
+ "dtype": "boolean"
42
+ },
43
+ "model_name": {
44
+ "dtype": "string"
45
+ },
46
+ "detoxify": {
47
+ "fields": {
48
+ "toxicity": {
49
+ "dtype": "float64"
50
+ },
51
+ "severe_toxicity": {
52
+ "dtype": "float64"
53
+ },
54
+ "obscene": {
55
+ "dtype": "float64"
56
+ },
57
+ "identity_attack": {
58
+ "dtype": "float64"
59
+ },
60
+ "insult": {
61
+ "dtype": "float64"
62
+ },
63
+ "threat": {
64
+ "dtype": "float64"
65
+ },
66
+ "sexual_explicit": {
67
+ "dtype": "float64"
68
+ }
69
+ }
70
+ },
71
+ "message_tree_id": {
72
+ "dtype": "string"
73
+ },
74
+ "tree_state": {
75
+ "dtype": "string"
76
+ },
77
+ "emojis": {
78
+ "fields": {
79
+ "name": {
80
+ "repeated_field": {
81
+ "dtype": "string"
82
+ }
83
+ },
84
+ "count": {
85
+ "repeated_field": {
86
+ "dtype": "int32"
87
+ }
88
+ }
89
+ }
90
+ },
91
+ "labels": {
92
+ "fields": {
93
+ "name": {
94
+ "repeated_field": {
95
+ "dtype": "string"
96
+ }
97
+ },
98
+ "value": {
99
+ "repeated_field": {
100
+ "dtype": "float64"
101
+ }
102
+ },
103
+ "count": {
104
+ "repeated_field": {
105
+ "dtype": "int32"
106
+ }
107
+ }
108
+ }
109
+ },
110
+ "__hfsplit__": {
111
+ "dtype": "string"
112
+ },
113
+ "__rowid__": {
114
+ "dtype": "string"
115
+ }
116
+ }
117
+ }
118
+ }
data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:138c8efe1e911904c3702c582b892acc8c5616062a35773c31872a8969e2badf
3
+ size 327991072
data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8bcd3f617d324acd7e13d0d0fabd38065012bea40141579e16681bcdfdcaf46
3
+ size 6171232
data/datasets/lilac/open-asssistant-conversations/text/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(text)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "text": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "text"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1cda70a8dc3259ff058e5e3ffc24cfbaaafe3fb9ba5c1b836e0757180114e28
3
+ size 5164058
data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8944421e23764080d8fde7460d08aa683ebbafc6fad2bd65654ea701ba50ca
3
+ size 2980981
data/datasets/lilac/open-asssistant-conversations/text/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "text"
30
+ ]
31
+ }
data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2949cfca1b91bb99c56364fdb47679301b90d1f51bd1963f04fbbcbe093d15c
3
+ size 3486319
data/datasets/lilac/open-asssistant-conversations/text/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "text"
35
+ ]
36
+ }
data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c010e414a4379f8c1637c54864c46c7872a7ed0dc26990c5b755581d2073f8b
3
+ size 2953059
data/datasets/lilac/open-asssistant-conversations/text/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "text"
44
+ ]
45
+ }
data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e011c0efb333a2d028e1be33030bf795fc373f27a3c2ce611099081057df2be
3
+ size 5955273
data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "spacy_ner": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "label": {
17
+ "dtype": "string"
18
+ }
19
+ },
20
+ "dtype": "string_span"
21
+ },
22
+ "signal": {
23
+ "model": "en_core_web_sm",
24
+ "signal_name": "spacy_ner"
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "model": "en_core_web_sm",
33
+ "signal_name": "spacy_ner"
34
+ },
35
+ "enriched_path": [
36
+ "text"
37
+ ]
38
+ }
data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb0f5af1af587a3b083dd7859f9cd4a5cf2943e41396c776db9a2a4f59eb4c9d
3
+ size 3827015
data/datasets/lilac/open-asssistant-conversations/text/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "text"
58
+ ]
59
+ }