nsthorat-lilac commited on
Commit
fcf8b49
1 Parent(s): 9440712

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -7,3 +7,12 @@ data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00
7
  data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
8
  data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
9
  data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
7
  data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
8
  data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
9
  data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
10
+ data/datasets/local/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
11
+ data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
12
+ data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
13
+ data/datasets/local/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
14
+ data/datasets/local/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
15
+ data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
16
+ data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
17
+ data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
18
+ data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/local/imdb/config.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ name: imdb
2
+ namespace: local
3
+ settings:
4
+ preferred_embedding: gte-small
5
+ ui:
6
+ media_paths:
7
+ - text
8
+ source:
9
+ source_name: huggingface
10
+ dataset_name: imdb
data/datasets/local/imdb/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf3f121bae8b8d8c12af8bebe4cda35c2a84750470fff57ea37a4930c257d6f
3
+ size 86160733
data/datasets/local/imdb/manifest.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "text": {
8
+ "dtype": "string"
9
+ },
10
+ "label": {
11
+ "dtype": "string"
12
+ },
13
+ "__hfsplit__": {
14
+ "dtype": "string"
15
+ },
16
+ "__rowid__": {
17
+ "dtype": "string"
18
+ }
19
+ }
20
+ }
21
+ }
data/datasets/local/imdb/settings.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4659a623093a2ef1646885a6ecb6ef86c56c2dcd0b10900d7b46d193dfb69e7f
3
+ size 691432464
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cbf4a5777b0cd1f8bb5061a6177b27cc0f5a8a6349c487c0c5c52fe60697d64
3
+ size 10390846
data/datasets/local/imdb/text/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(text)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "text": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "text"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/local/imdb/text/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:526e8505beb2386e3ff30367968685fd2229f76af2c0c86d50afaa7da3018dbc
3
+ size 7476546
data/datasets/local/imdb/text/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45651752b9f1178504ed253e070243e2782a79faf0e1272c3b9e3ba4ed8a717d
3
+ size 3309640
data/datasets/local/imdb/text/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "text"
30
+ ]
31
+ }
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ccd12fc66d0c31a19554fcb5f442751807745e51c3a9336cec637525a422fc
3
+ size 3916036
data/datasets/local/imdb/text/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.75,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.75,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "text"
35
+ ]
36
+ }
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f1f559281ca4e3efcafd4b10c51cbe2f5039d86ce95d3dc07156671fd8b824
3
+ size 3313984
data/datasets/local/imdb/text/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "text"
44
+ ]
45
+ }
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19ce0e0966a4db29b7b862aa3fa87ef3b02997e57efcdd722023819caa1be7bb
3
+ size 8483750
data/datasets/local/imdb/text/spacy_ner/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "spacy_ner(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "spacy_ner": {
14
+ "repeated_field": {
15
+ "fields": {
16
+ "label": {
17
+ "dtype": "string"
18
+ }
19
+ },
20
+ "dtype": "string_span"
21
+ },
22
+ "signal": {
23
+ "model": "en_core_web_sm",
24
+ "signal_name": "spacy_ner"
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "model": "en_core_web_sm",
33
+ "signal_name": "spacy_ner"
34
+ },
35
+ "enriched_path": [
36
+ "text"
37
+ ]
38
+ }
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:672357a255fecf4e29604674ff3ceb11b6772d0388293f5267f608a6163faf49
3
+ size 4404092
data/datasets/local/imdb/text/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "text"
58
+ ]
59
+ }