nsthorat commited on
Commit
8640ad4
1 Parent(s): b027cda
Files changed (37) hide show
  1. .gitattributes +0 -18
  2. README.md +1 -1
  3. data/datasets/local/imdb/data-00000-of-00001.parquet +0 -3
  4. data/datasets/local/imdb/manifest.json +0 -21
  5. data/datasets/local/imdb/settings.json +0 -1
  6. data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin +0 -3
  7. data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl +0 -3
  8. data/datasets/local/imdb/text/gte-small/signal_manifest.json +0 -35
  9. data/datasets/local/imdb/text/gte-small/spans.pkl +0 -3
  10. data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet +0 -3
  11. data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/signal_manifest.json +0 -38
  12. data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet +0 -3
  13. data/datasets/local/imdb/text/near_dup/signal_manifest.json +0 -36
  14. data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet +0 -3
  15. data/datasets/local/imdb/text/pii/signal_manifest.json +0 -45
  16. data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet +0 -3
  17. data/datasets/local/imdb/text/spacy_ner/signal_manifest.json +0 -38
  18. data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet +0 -3
  19. data/datasets/local/imdb/text/text_statistics/signal_manifest.json +0 -59
  20. data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet +0 -3
  21. data/datasets/local/open-asssistant-conversations/manifest.json +0 -118
  22. data/datasets/local/open-asssistant-conversations/settings.json +0 -1
  23. data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin +0 -3
  24. data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl +0 -3
  25. data/datasets/local/open-asssistant-conversations/text/gte-small/signal_manifest.json +0 -35
  26. data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl +0 -3
  27. data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet +0 -3
  28. data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/signal_manifest.json +0 -38
  29. data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet +0 -3
  30. data/datasets/local/open-asssistant-conversations/text/near_dup/signal_manifest.json +0 -36
  31. data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet +0 -3
  32. data/datasets/local/open-asssistant-conversations/text/pii/signal_manifest.json +0 -45
  33. data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet +0 -3
  34. data/datasets/local/open-asssistant-conversations/text/spacy_ner/signal_manifest.json +0 -38
  35. data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet +0 -3
  36. data/datasets/local/open-asssistant-conversations/text/text_statistics/signal_manifest.json +0 -59
  37. lilac/concepts/db_concept.py +1 -0
.gitattributes DELETED
@@ -1,18 +0,0 @@
1
- data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
2
- data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
3
- data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
4
- data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
5
- data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
6
- data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
7
- data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
8
- data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
9
- data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
10
- data/datasets/local/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
11
- data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
12
- data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
13
- data/datasets/local/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
14
- data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
15
- data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
16
- data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
17
- data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
18
- data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Lilac Blueprint
3
  emoji: 🌷
4
  colorFrom: purple
5
  colorTo: purple
 
1
  ---
2
+ title: Lilac
3
  emoji: 🌷
4
  colorFrom: purple
5
  colorTo: purple
data/datasets/local/imdb/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cf3f121bae8b8d8c12af8bebe4cda35c2a84750470fff57ea37a4930c257d6f
3
- size 86160733
 
 
 
 
data/datasets/local/imdb/manifest.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "data_schema": {
6
- "fields": {
7
- "text": {
8
- "dtype": "string"
9
- },
10
- "label": {
11
- "dtype": "string"
12
- },
13
- "__hfsplit__": {
14
- "dtype": "string"
15
- },
16
- "__rowid__": {
17
- "dtype": "string"
18
- }
19
- }
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/settings.json DELETED
@@ -1 +0,0 @@
1
- {"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
 
 
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4659a623093a2ef1646885a6ecb6ef86c56c2dcd0b10900d7b46d193dfb69e7f
3
- size 691432464
 
 
 
 
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cbf4a5777b0cd1f8bb5061a6177b27cc0f5a8a6349c487c0c5c52fe60697d64
3
- size 10390846
 
 
 
 
data/datasets/local/imdb/text/gte-small/signal_manifest.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "files": [],
3
- "parquet_id": "gte-small(text)",
4
- "data_schema": {
5
- "fields": {
6
- "__rowid__": {
7
- "dtype": "string"
8
- },
9
- "text": {
10
- "fields": {
11
- "gte-small": {
12
- "repeated_field": {
13
- "fields": {
14
- "embedding": {
15
- "dtype": "embedding"
16
- }
17
- },
18
- "dtype": "string_span"
19
- },
20
- "signal": {
21
- "signal_name": "gte-small"
22
- }
23
- }
24
- }
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "gte-small"
30
- },
31
- "enriched_path": [
32
- "text"
33
- ],
34
- "vector_store": "hnsw"
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/text/gte-small/spans.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:526e8505beb2386e3ff30367968685fd2229f76af2c0c86d50afaa7da3018dbc
3
- size 7476546
 
 
 
 
data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dd9ee881d6bf4fa2bb3a6db647d0c6d1f648b4a80b4e6d1aa081032bfddf5bc
3
- size 3495763
 
 
 
 
data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "lang_detection(split_by_paragraph=True)(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "lang_detection(split_by_paragraph=True)": {
14
- "repeated_field": {
15
- "fields": {
16
- "lang_code": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "split_by_paragraph": true,
24
- "signal_name": "lang_detection"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "split_by_paragraph": true,
33
- "signal_name": "lang_detection"
34
- },
35
- "enriched_path": [
36
- "text"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1ccd12fc66d0c31a19554fcb5f442751807745e51c3a9336cec637525a422fc
3
- size 3916036
 
 
 
 
data/datasets/local/imdb/text/near_dup/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "near_dup(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.75,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- },
29
- "signal": {
30
- "threshold": 0.75,
31
- "signal_name": "near_dup"
32
- },
33
- "enriched_path": [
34
- "text"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4f1f559281ca4e3efcafd4b10c51cbe2f5039d86ce95d3dc07156671fd8b824
3
- size 3313984
 
 
 
 
data/datasets/local/imdb/text/pii/signal_manifest.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "pii(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- },
39
- "signal": {
40
- "signal_name": "pii"
41
- },
42
- "enriched_path": [
43
- "text"
44
- ]
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:19ce0e0966a4db29b7b862aa3fa87ef3b02997e57efcdd722023819caa1be7bb
3
- size 8483750
 
 
 
 
data/datasets/local/imdb/text/spacy_ner/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "spacy_ner(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "spacy_ner": {
14
- "repeated_field": {
15
- "fields": {
16
- "label": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "model": "en_core_web_sm",
24
- "signal_name": "spacy_ner"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "model": "en_core_web_sm",
33
- "signal_name": "spacy_ner"
34
- },
35
- "enriched_path": [
36
- "text"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:672357a255fecf4e29604674ff3ceb11b6772d0388293f5267f608a6163faf49
3
- size 4404092
 
 
 
 
data/datasets/local/imdb/text/text_statistics/signal_manifest.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "text_statistics(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- },
53
- "signal": {
54
- "signal_name": "text_statistics"
55
- },
56
- "enriched_path": [
57
- "text"
58
- ]
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2557dc647ff10b0396e9b40f24468f599661c664ff777c62647605503dea94dc
3
- size 42071787
 
 
 
 
data/datasets/local/open-asssistant-conversations/manifest.json DELETED
@@ -1,118 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "data_schema": {
6
- "fields": {
7
- "message_id": {
8
- "dtype": "string"
9
- },
10
- "parent_id": {
11
- "dtype": "string"
12
- },
13
- "user_id": {
14
- "dtype": "string"
15
- },
16
- "created_date": {
17
- "dtype": "string"
18
- },
19
- "text": {
20
- "dtype": "string"
21
- },
22
- "role": {
23
- "dtype": "string"
24
- },
25
- "lang": {
26
- "dtype": "string"
27
- },
28
- "review_count": {
29
- "dtype": "int32"
30
- },
31
- "review_result": {
32
- "dtype": "boolean"
33
- },
34
- "deleted": {
35
- "dtype": "boolean"
36
- },
37
- "rank": {
38
- "dtype": "int32"
39
- },
40
- "synthetic": {
41
- "dtype": "boolean"
42
- },
43
- "model_name": {
44
- "dtype": "string"
45
- },
46
- "detoxify": {
47
- "fields": {
48
- "toxicity": {
49
- "dtype": "float64"
50
- },
51
- "severe_toxicity": {
52
- "dtype": "float64"
53
- },
54
- "obscene": {
55
- "dtype": "float64"
56
- },
57
- "identity_attack": {
58
- "dtype": "float64"
59
- },
60
- "insult": {
61
- "dtype": "float64"
62
- },
63
- "threat": {
64
- "dtype": "float64"
65
- },
66
- "sexual_explicit": {
67
- "dtype": "float64"
68
- }
69
- }
70
- },
71
- "message_tree_id": {
72
- "dtype": "string"
73
- },
74
- "tree_state": {
75
- "dtype": "string"
76
- },
77
- "emojis": {
78
- "fields": {
79
- "name": {
80
- "repeated_field": {
81
- "dtype": "string"
82
- }
83
- },
84
- "count": {
85
- "repeated_field": {
86
- "dtype": "int32"
87
- }
88
- }
89
- }
90
- },
91
- "labels": {
92
- "fields": {
93
- "name": {
94
- "repeated_field": {
95
- "dtype": "string"
96
- }
97
- },
98
- "value": {
99
- "repeated_field": {
100
- "dtype": "float64"
101
- }
102
- },
103
- "count": {
104
- "repeated_field": {
105
- "dtype": "int32"
106
- }
107
- }
108
- }
109
- },
110
- "__hfsplit__": {
111
- "dtype": "string"
112
- },
113
- "__rowid__": {
114
- "dtype": "string"
115
- }
116
- }
117
- }
118
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/settings.json DELETED
@@ -1 +0,0 @@
1
- {"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
 
 
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ef42015e1cfa76fc929c0a8913911c765e871586af7eac6f42def6abbd856f5
3
- size 327991004
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4370d3885f9dea3df44fc7e366069c83c93af3b068ed5a56eaa2ac442c4f502c
3
- size 6171229
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/gte-small/signal_manifest.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "files": [],
3
- "parquet_id": "gte-small(text)",
4
- "data_schema": {
5
- "fields": {
6
- "__rowid__": {
7
- "dtype": "string"
8
- },
9
- "text": {
10
- "fields": {
11
- "gte-small": {
12
- "repeated_field": {
13
- "fields": {
14
- "embedding": {
15
- "dtype": "embedding"
16
- }
17
- },
18
- "dtype": "string_span"
19
- },
20
- "signal": {
21
- "signal_name": "gte-small"
22
- }
23
- }
24
- }
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "gte-small"
30
- },
31
- "enriched_path": [
32
- "text"
33
- ],
34
- "vector_store": "hnsw"
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9b6962f47eefe550b314cdb4d6c6eb0811670f5f41d137b952fcc55e1d331cc
3
- size 5164058
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:17cf7d5bd4e5b74dbe3024da1e4115c013b65626901916a0aa471e79ba88d1b1
3
- size 3765373
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "lang_detection(split_by_paragraph=True)(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "lang_detection(split_by_paragraph=True)": {
14
- "repeated_field": {
15
- "fields": {
16
- "lang_code": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "split_by_paragraph": true,
24
- "signal_name": "lang_detection"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "split_by_paragraph": true,
33
- "signal_name": "lang_detection"
34
- },
35
- "enriched_path": [
36
- "text"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:43c99611fc94cdd4998e03f18f651fe2ea7b515a5780bbcb78baa2030a3b39b1
3
- size 3485154
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/near_dup/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "near_dup(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.75,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- },
29
- "signal": {
30
- "threshold": 0.75,
31
- "signal_name": "near_dup"
32
- },
33
- "enriched_path": [
34
- "text"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cc6966d1c3c262121fa6130ff54e4ba7431d89ae81dfbc9ef9025f31bf095be
3
- size 2953280
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/pii/signal_manifest.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "pii(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- },
39
- "signal": {
40
- "signal_name": "pii"
41
- },
42
- "enriched_path": [
43
- "text"
44
- ]
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:971edbdc4bdfad768444691a2e276f3c69e99a9f5251168aaa7fd2a89a649043
3
- size 5955494
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/spacy_ner/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "spacy_ner(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "spacy_ner": {
14
- "repeated_field": {
15
- "fields": {
16
- "label": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "model": "en_core_web_sm",
24
- "signal_name": "spacy_ner"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "model": "en_core_web_sm",
33
- "signal_name": "spacy_ner"
34
- },
35
- "enriched_path": [
36
- "text"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1adb0c31662191820bcffbefd09c00ecdc101bbc41b9941179ff0b4fd78d11b
3
- size 3827236
 
 
 
 
data/datasets/local/open-asssistant-conversations/text/text_statistics/signal_manifest.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "text_statistics(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- },
53
- "signal": {
54
- "signal_name": "text_statistics"
55
- },
56
- "enriched_path": [
57
- "text"
58
- ]
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lilac/concepts/db_concept.py CHANGED
@@ -366,6 +366,7 @@ class DiskConceptDB(ConceptDB):
366
  f'Concept "{namespace}/{name}" does not exist or user does not have access.')
367
 
368
  concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
 
369
  if not file_exists(concept_json_path):
370
  return None
371
 
 
366
  f'Concept "{namespace}/{name}" does not exist or user does not have access.')
367
 
368
  concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
369
+ print('json path=', concept_json_path)
370
  if not file_exists(concept_json_path):
371
  return None
372