dsmilkov commited on
Commit
fca48a0
1 Parent(s): fa6b9f2
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -87
  2. Dockerfile +1 -1
  3. data/datasets/lilac/imdb/config.yml +0 -31
  4. data/datasets/lilac/imdb/data-00000-of-00001.parquet +0 -3
  5. data/datasets/lilac/imdb/manifest.json +0 -21
  6. data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin +0 -3
  7. data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl +0 -3
  8. data/datasets/lilac/imdb/text/gte-small/signal_manifest.json +0 -35
  9. data/datasets/lilac/imdb/text/gte-small/spans.pkl +0 -3
  10. data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet +0 -3
  11. data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json +0 -31
  12. data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet +0 -3
  13. data/datasets/lilac/imdb/text/near_dup/signal_manifest.json +0 -36
  14. data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet +0 -3
  15. data/datasets/lilac/imdb/text/pii/signal_manifest.json +0 -45
  16. data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet +0 -3
  17. data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json +0 -38
  18. data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet +0 -3
  19. data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json +0 -59
  20. data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin +0 -3
  21. data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl +0 -3
  22. data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json +0 -38
  23. data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl +0 -3
  24. data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet +0 -3
  25. data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json +0 -34
  26. data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet +0 -3
  27. data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json +0 -39
  28. data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet +0 -3
  29. data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json +0 -48
  30. data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet +0 -3
  31. data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json +0 -41
  32. data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet +0 -3
  33. data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json +0 -62
  34. data/datasets/lilac/mmlu_professional_law/config.yml +0 -63
  35. data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet +0 -3
  36. data/datasets/lilac/mmlu_professional_law/manifest.json +0 -26
  37. data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin +0 -3
  38. data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl +0 -3
  39. data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json +0 -35
  40. data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl +0 -3
  41. data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet +0 -3
  42. data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json +0 -31
  43. data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet +0 -3
  44. data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json +0 -36
  45. data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet +0 -3
  46. data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json +0 -45
  47. data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet +0 -3
  48. data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json +0 -38
  49. data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet +0 -3
  50. data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json +0 -59
.gitattributes DELETED
@@ -1,87 +0,0 @@
1
- data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
2
- data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
3
- data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
4
- data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
5
- data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
6
- data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
7
- data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
8
- data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
9
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
10
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
11
- data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
12
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
13
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
14
- data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
15
- data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
16
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
17
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
18
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
19
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
20
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
21
- data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
22
- data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
23
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
24
- data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
25
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
26
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
27
- data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
28
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
29
- data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
30
- data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
31
- data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
32
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
33
- data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
34
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
35
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
36
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
37
- data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
38
- data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
39
- data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
40
- data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
41
- data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
42
- data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
43
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
44
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
45
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
46
- data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
47
- data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
48
- data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
49
- data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
50
- data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
51
- data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
52
- data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
53
- data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
54
- data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
55
- data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
56
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
57
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
58
- data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
59
- data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
60
- data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
61
- data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
62
- data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
63
- data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
64
- data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
65
- data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
66
- data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
67
- data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
68
- data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
69
- data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
70
- data/datasets/lilac/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
71
- data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
72
- data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
73
- data/datasets/lilac/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
74
- data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
75
- data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
76
- data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
- data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
78
- data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
79
- data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
80
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
81
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
82
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
83
- data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
84
- data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
85
- data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
86
- data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
87
- data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -21,4 +21,4 @@ COPY /lilac ./lilac/
21
 
22
  COPY docker_start.sh docker_start.py ./
23
 
24
- CMD ./docker_start.sh
 
21
 
22
  COPY docker_start.sh docker_start.py ./
23
 
24
+ CMD ["bash", "docker_start.sh"]
data/datasets/lilac/imdb/config.yml DELETED
@@ -1,31 +0,0 @@
1
- embeddings:
2
- - embedding: gte-small
3
- path: text
4
- name: imdb
5
- namespace: local
6
- settings:
7
- preferred_embedding: gte-small
8
- ui:
9
- media_paths:
10
- - text
11
- signals:
12
- - path: text
13
- signal:
14
- signal_name: near_dup
15
- - path: text
16
- signal:
17
- signal_name: text_statistics
18
- - path: text
19
- signal:
20
- signal_name: lang_detection
21
- - path: text
22
- signal:
23
- signal_name: spacy_ner
24
- - path: text
25
- signal:
26
- signal_name: pii
27
- source:
28
- dataset_name: imdb
29
- source_name: huggingface
30
- tags:
31
- - machine-learning
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fe90be23f86ca1e73b8a77a235344db822601f794a5643dca9d0d07c49ce3d8
3
- size 86160450
 
 
 
 
data/datasets/lilac/imdb/manifest.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "data_schema": {
6
- "fields": {
7
- "text": {
8
- "dtype": "string"
9
- },
10
- "label": {
11
- "dtype": "string"
12
- },
13
- "__hfsplit__": {
14
- "dtype": "string"
15
- },
16
- "__rowid__": {
17
- "dtype": "string"
18
- }
19
- }
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b8fc440b947f068966bc68c83ce4b5502b98e3aab928d58c295dc8a7b7b016c
3
- size 691432396
 
 
 
 
data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3ee2426c761b50025e02f2388fa65c51eb9705d6fd65f95d6502650214b8472
3
- size 10390867
 
 
 
 
data/datasets/lilac/imdb/text/gte-small/signal_manifest.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "files": [],
3
- "parquet_id": "gte-small(text)",
4
- "data_schema": {
5
- "fields": {
6
- "__rowid__": {
7
- "dtype": "string"
8
- },
9
- "text": {
10
- "fields": {
11
- "gte-small": {
12
- "repeated_field": {
13
- "fields": {
14
- "embedding": {
15
- "dtype": "embedding"
16
- }
17
- },
18
- "dtype": "string_span"
19
- },
20
- "signal": {
21
- "signal_name": "gte-small"
22
- }
23
- }
24
- }
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "gte-small"
30
- },
31
- "enriched_path": [
32
- "text"
33
- ],
34
- "vector_store": "hnsw"
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/gte-small/spans.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:16417449057fc4304e098e037cb2b3e9a693570768a68bdca457d452adaee130
3
- size 7476546
 
 
 
 
data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:adba6df16ec47b68625618b01a0b8d2cc65de572acb20eed561128b037fcdfd7
3
- size 3309315
 
 
 
 
data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "lang_detection(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "lang_detection": {
14
- "dtype": "string",
15
- "signal": {
16
- "split_by_paragraph": false,
17
- "signal_name": "lang_detection"
18
- }
19
- }
20
- }
21
- }
22
- }
23
- },
24
- "signal": {
25
- "split_by_paragraph": false,
26
- "signal_name": "lang_detection"
27
- },
28
- "enriched_path": [
29
- "text"
30
- ]
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c75a5f0d12b8b02671e99a2dd313d292610c9264f2302ad1877959430420079
3
- size 3915752
 
 
 
 
data/datasets/lilac/imdb/text/near_dup/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "near_dup(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.85,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- },
29
- "signal": {
30
- "threshold": 0.85,
31
- "signal_name": "near_dup"
32
- },
33
- "enriched_path": [
34
- "text"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2dfe71845bfd419a2a309b132456341bf317d63e8ccf8bc100835f1a20c81c5b
3
- size 3313701
 
 
 
 
data/datasets/lilac/imdb/text/pii/signal_manifest.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "pii(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- },
39
- "signal": {
40
- "signal_name": "pii"
41
- },
42
- "enriched_path": [
43
- "text"
44
- ]
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:be39eb80c981da64a2ec5f7fe42c43ffb390b887a220e278a58cdde04e7824ef
3
- size 8479478
 
 
 
 
data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "spacy_ner(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "spacy_ner": {
14
- "repeated_field": {
15
- "fields": {
16
- "label": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "model": "en_core_web_sm",
24
- "signal_name": "spacy_ner"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "model": "en_core_web_sm",
33
- "signal_name": "spacy_ner"
34
- },
35
- "enriched_path": [
36
- "text"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4919817d0c520fc5aacc9721f01a0e29fb4d2c89b6698865c9680c81c44ce920
3
- size 4403809
 
 
 
 
data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "text_statistics(text)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "text": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- },
53
- "signal": {
54
- "signal_name": "text_statistics"
55
- },
56
- "enriched_path": [
57
- "text"
58
- ]
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:df9d6e2f5df4b8693544f31ca78a9d1936a4caf47acc2babeb1cb766131b7636
3
- size 684360968
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2081ce5d760026fe341e0553cd9e40747ca902e4e7edb851cb747f350f19bb0d
3
- size 11174465
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [],
3
- "parquet_id": "gte-small(choices)",
4
- "data_schema": {
5
- "fields": {
6
- "__rowid__": {
7
- "dtype": "string"
8
- },
9
- "choices": {
10
- "repeated_field": {
11
- "fields": {
12
- "gte-small": {
13
- "repeated_field": {
14
- "fields": {
15
- "embedding": {
16
- "dtype": "embedding"
17
- }
18
- },
19
- "dtype": "string_span"
20
- },
21
- "signal": {
22
- "signal_name": "gte-small"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- }
29
- },
30
- "signal": {
31
- "signal_name": "gte-small"
32
- },
33
- "enriched_path": [
34
- "choices",
35
- "*"
36
- ],
37
- "vector_store": "hnsw"
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:02fb1662da21f33ea1429a0f9adf1301185da46f642a722717fe7c523314fa57
3
- size 11173475
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:968d4f87c7b51b995d9e3a96423a06b91984e5ee4a47062cd53fe87cca5cafbe
3
- size 3469413
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "lang_detection(choices)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "choices": {
12
- "repeated_field": {
13
- "fields": {
14
- "lang_detection": {
15
- "dtype": "string",
16
- "signal": {
17
- "split_by_paragraph": false,
18
- "signal_name": "lang_detection"
19
- }
20
- }
21
- }
22
- }
23
- }
24
- }
25
- },
26
- "signal": {
27
- "split_by_paragraph": false,
28
- "signal_name": "lang_detection"
29
- },
30
- "enriched_path": [
31
- "choices",
32
- "*"
33
- ]
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:316f3be499fcbb960bc1e83a201838ca0b3047a71d8e1c302fe4e0d833a3bf90
3
- size 5544176
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "near_dup(choices)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "choices": {
12
- "repeated_field": {
13
- "fields": {
14
- "near_dup": {
15
- "fields": {
16
- "cluster_id": {
17
- "dtype": "uint32",
18
- "categorical": true
19
- }
20
- },
21
- "signal": {
22
- "threshold": 0.85,
23
- "signal_name": "near_dup"
24
- }
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "threshold": 0.85,
33
- "signal_name": "near_dup"
34
- },
35
- "enriched_path": [
36
- "choices",
37
- "*"
38
- ]
39
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cb41d4e9d0d82bd824abfa733d5be3a599e011098c5d41ebadeb1166a15f722
3
- size 3393096
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json DELETED
@@ -1,48 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "pii(choices)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "choices": {
12
- "repeated_field": {
13
- "fields": {
14
- "pii": {
15
- "fields": {
16
- "emails": {
17
- "repeated_field": {
18
- "dtype": "string_span"
19
- }
20
- },
21
- "ip_addresses": {
22
- "repeated_field": {
23
- "dtype": "string_span"
24
- }
25
- },
26
- "secrets": {
27
- "repeated_field": {
28
- "dtype": "string_span"
29
- }
30
- }
31
- },
32
- "signal": {
33
- "signal_name": "pii"
34
- }
35
- }
36
- }
37
- }
38
- }
39
- }
40
- },
41
- "signal": {
42
- "signal_name": "pii"
43
- },
44
- "enriched_path": [
45
- "choices",
46
- "*"
47
- ]
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b1255490f17c64f88b8b332c7df30060df612b9de11b17aaf6f70234c363e1e
3
- size 4080744
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "spacy_ner(choices)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "choices": {
12
- "repeated_field": {
13
- "fields": {
14
- "spacy_ner": {
15
- "repeated_field": {
16
- "fields": {
17
- "label": {
18
- "dtype": "string"
19
- }
20
- },
21
- "dtype": "string_span"
22
- },
23
- "signal": {
24
- "model": "en_core_web_sm",
25
- "signal_name": "spacy_ner"
26
- }
27
- }
28
- }
29
- }
30
- }
31
- }
32
- },
33
- "signal": {
34
- "model": "en_core_web_sm",
35
- "signal_name": "spacy_ner"
36
- },
37
- "enriched_path": [
38
- "choices",
39
- "*"
40
- ]
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc00a68e0f835e25b214d90e7e48251b39d748f1e836af713440cd0ea2517ead
3
- size 4634821
 
 
 
 
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json DELETED
@@ -1,62 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "text_statistics(choices)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "choices": {
12
- "repeated_field": {
13
- "fields": {
14
- "text_statistics": {
15
- "fields": {
16
- "num_characters": {
17
- "dtype": "int32"
18
- },
19
- "readability": {
20
- "dtype": "float32"
21
- },
22
- "log(type_token_ratio)": {
23
- "dtype": "float32"
24
- },
25
- "frac_non_ascii": {
26
- "dtype": "float32",
27
- "bins": [
28
- [
29
- "Low",
30
- null,
31
- 0.15
32
- ],
33
- [
34
- "Medium",
35
- 0.15,
36
- 0.3
37
- ],
38
- [
39
- "High",
40
- 0.3,
41
- null
42
- ]
43
- ]
44
- }
45
- },
46
- "signal": {
47
- "signal_name": "text_statistics"
48
- }
49
- }
50
- }
51
- }
52
- }
53
- }
54
- },
55
- "signal": {
56
- "signal_name": "text_statistics"
57
- },
58
- "enriched_path": [
59
- "choices",
60
- "*"
61
- ]
62
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/config.yml DELETED
@@ -1,63 +0,0 @@
1
- embeddings:
2
- - embedding: gte-small
3
- path:
4
- - choices
5
- - '*'
6
- - embedding: gte-small
7
- path: question
8
- name: mmlu_professional_law
9
- namespace: local
10
- settings:
11
- preferred_embedding: gte-small
12
- ui:
13
- media_paths:
14
- - question
15
- - - choices
16
- - '*'
17
- signals:
18
- - path: question
19
- signal:
20
- signal_name: text_statistics
21
- - path: question
22
- signal:
23
- signal_name: pii
24
- - path: question
25
- signal:
26
- signal_name: near_dup
27
- - path:
28
- - choices
29
- - '*'
30
- signal:
31
- signal_name: text_statistics
32
- - path:
33
- - choices
34
- - '*'
35
- signal:
36
- signal_name: spacy_ner
37
- - path: question
38
- signal:
39
- signal_name: lang_detection
40
- - path:
41
- - choices
42
- - '*'
43
- signal:
44
- signal_name: near_dup
45
- - path:
46
- - choices
47
- - '*'
48
- signal:
49
- signal_name: pii
50
- - path:
51
- - choices
52
- - '*'
53
- signal:
54
- signal_name: lang_detection
55
- - path: question
56
- signal:
57
- signal_name: spacy_ner
58
- source:
59
- config_name: professional_law
60
- dataset_name: cais/mmlu
61
- source_name: huggingface
62
- tags:
63
- - legal
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:65cd2771cf0bb88dbed9ad66ceaff472115f07c9dfea866c7e3f65b68392e745
3
- size 50699938
 
 
 
 
data/datasets/lilac/mmlu_professional_law/manifest.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "data_schema": {
6
- "fields": {
7
- "question": {
8
- "dtype": "string"
9
- },
10
- "choices": {
11
- "repeated_field": {
12
- "dtype": "string"
13
- }
14
- },
15
- "answer": {
16
- "dtype": "string"
17
- },
18
- "__hfsplit__": {
19
- "dtype": "string"
20
- },
21
- "__rowid__": {
22
- "dtype": "string"
23
- }
24
- }
25
- }
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b02300405fccc3011294e15ee869933dd81578173435defbcb19e3b40a65e93
3
- size 771802212
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f72169740d80ee2b2ea66589d7ebcc58c83381978a4640a27510c416a02bf6c7
3
- size 11296648
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "files": [],
3
- "parquet_id": "gte-small(question)",
4
- "data_schema": {
5
- "fields": {
6
- "__rowid__": {
7
- "dtype": "string"
8
- },
9
- "question": {
10
- "fields": {
11
- "gte-small": {
12
- "repeated_field": {
13
- "fields": {
14
- "embedding": {
15
- "dtype": "embedding"
16
- }
17
- },
18
- "dtype": "string_span"
19
- },
20
- "signal": {
21
- "signal_name": "gte-small"
22
- }
23
- }
24
- }
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "gte-small"
30
- },
31
- "enriched_path": [
32
- "question"
33
- ],
34
- "vector_store": "hnsw"
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b51cad455e94b167bc9cf130c262ed1b143a8f386c7074a61983e01cd93d277
3
- size 7911602
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf6cf8cdc246ce4406599aec8782d3be02f2585f1fbad74173faf0ffcb453a49
3
- size 3361922
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "lang_detection(question)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "question": {
12
- "fields": {
13
- "lang_detection": {
14
- "dtype": "string",
15
- "signal": {
16
- "split_by_paragraph": false,
17
- "signal_name": "lang_detection"
18
- }
19
- }
20
- }
21
- }
22
- }
23
- },
24
- "signal": {
25
- "split_by_paragraph": false,
26
- "signal_name": "lang_detection"
27
- },
28
- "enriched_path": [
29
- "question"
30
- ]
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c4139f699d1a248cf5378c442ef6f17970913394d5d0c79bd7c6e6801ab548a
3
- size 3697516
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "near_dup(question)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "question": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.85,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- },
29
- "signal": {
30
- "threshold": 0.85,
31
- "signal_name": "near_dup"
32
- },
33
- "enriched_path": [
34
- "question"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2735c4a2c5d40973652d369140533af74425db6dd753f8a25850d4efeee4928e
3
- size 3369080
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "pii(question)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "question": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- },
39
- "signal": {
40
- "signal_name": "pii"
41
- },
42
- "enriched_path": [
43
- "question"
44
- ]
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e775b663f9a3b7c7ebdd31f9a860254dec31c18aa46c5a61820050d0556cbb0f
3
- size 9105982
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "spacy_ner(question)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "question": {
12
- "fields": {
13
- "spacy_ner": {
14
- "repeated_field": {
15
- "fields": {
16
- "label": {
17
- "dtype": "string"
18
- }
19
- },
20
- "dtype": "string_span"
21
- },
22
- "signal": {
23
- "model": "en_core_web_sm",
24
- "signal_name": "spacy_ner"
25
- }
26
- }
27
- }
28
- }
29
- }
30
- },
31
- "signal": {
32
- "model": "en_core_web_sm",
33
- "signal_name": "spacy_ner"
34
- },
35
- "enriched_path": [
36
- "question"
37
- ]
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:995b3ac42907ea244d9cb04c68a4715af8ddb7d72dcced056bc58dc9a9f05e7e
3
- size 4389031
 
 
 
 
data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "text_statistics(question)",
6
- "data_schema": {
7
- "fields": {
8
- "__rowid__": {
9
- "dtype": "string"
10
- },
11
- "question": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- },
53
- "signal": {
54
- "signal_name": "text_statistics"
55
- },
56
- "enriched_path": [
57
- "question"
58
- ]
59
- }