File size: 4,309 Bytes
b1494e2
76d83e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f4befa
77014bb
b1494e2
77014bb
b1494e2
7edba0c
b1494e2
 
7edba0c
 
 
b1494e2
 
 
 
7edba0c
 
af895fd
066d4a0
77014bb
b1494e2
31fbf3d
b1494e2
6739168
b1494e2
 
 
 
 
 
 
 
066d4a0
6739168
96e9569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1494e2
 
 
 
 
 
86fe272
 
02f502d
7edba0c
86fe272
7edba0c
494bc5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
066d4a0
 
494bc5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
066d4a0
 
494bc5a
7edba0c
77014bb
 
b1494e2
7edba0c
 
b1494e2
77bf495
 
 
066d4a0
 
7edba0c
 
 
066d4a0
77014bb
139e81a
 
 
7edba0c
 
 
 
 
 
 
4f4befa
 
 
 
 
 
7edba0c
172280a
7edba0c
 
 
31fbf3d
7edba0c
 
96e9569
 
 
 
 
 
 
7edba0c
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Dataset entries. Each one names a source to load and, optionally, the
# embeddings to compute and per-dataset settings.
datasets:
  # LDJnr/Capybara, loaded through the `huggingface` source.
  - name: Capybara
    namespace: lilac
    source:
      source_name: huggingface
      dataset_name: LDJnr/Capybara
    # gte-small embeddings for the `input` and `output` fields under
    # `conversation`. NOTE(review): '*' presumably matches every element of
    # the `conversation` list — confirm against the consumer's path syntax.
    embeddings:
      - embedding: gte-small
        path: [conversation, '*', input]
      - embedding: gte-small
        path: [conversation, '*', output]
    settings:
      tags: [datasets]
      ui:
        markdown_paths: []
        # Same two paths as the embeddings above.
        media_paths:
          - [conversation, '*', input]
          - [conversation, '*', output]
  # glaiveai/glaive-code-assistant, loaded through the `huggingface` source.
  - name: glaive-code-assistant
    namespace: lilac
    source:
      source_name: huggingface
      dataset_name: glaiveai/glaive-code-assistant
    # gte-small embeddings for the `question` and `answer` fields.
    embeddings:
      - embedding: gte-small
        path: question
      - embedding: gte-small
        path: answer
    settings:
      tags: [datasets]
      ui:
        markdown_paths: []
        media_paths: [question, answer]
  # OpenAssistant/oasst2, loaded through the `huggingface` source.
  - name: open-assistant-conversations-2
    namespace: lilac
    source:
      source_name: huggingface
      dataset_name: OpenAssistant/oasst2
    # A single gte-small embedding over the `text` field.
    embeddings:
      - embedding: gte-small
        path: text
    settings:
      tags: [datasets]
      ui:
        media_paths: [text]
  # lmsys-chat-1m, loaded through the `huggingface` source.
  - namespace: lilac
    name: lmsys-chat-1m
    source:
      # Must point at the LMSYS dataset: this entry reads
      # `conversation.*.content`, whereas OpenAssistant/oasst2 is configured
      # elsewhere in this file with a flat `text` field.
      dataset_name: lmsys/lmsys-chat-1m
      source_name: huggingface
    # gte-small embedding over the `content` field under `conversation`.
    # NOTE(review): '*' presumably matches every element of the
    # `conversation` list — confirm against the consumer's path syntax.
    embeddings:
      - path:
          - conversation
          - '*'
          - content
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - - conversation
            - '*'
            - content
      tags:
        - logs
  # argilla/databricks-dolly-15k-curated-en, loaded through the `huggingface`
  # source.
  - name: databricks-dolly-15k-curated-en
    namespace: lilac
    source:
      source_name: huggingface
      dataset_name: argilla/databricks-dolly-15k-curated-en
    # gte-small embeddings for the three `original-*` fields only.
    embeddings:
      - embedding: gte-small
        path: original-instruction
      - embedding: gte-small
        path: original-context
      - embedding: gte-small
        path: original-response
    settings:
      tags: [machine-learning]
      ui:
        # The `original-*` fields plus the `value` lists of the `new-*`
        # fields; the latter have no embedding configured above.
        media_paths:
          - original-instruction
          - original-context
          - original-response
          - [new-instruction, value, '*']
          - [new-context, value, '*']
          - [new-response, value, '*']
  # Open-Orca/OpenOrca, loaded through the `huggingface` source with
  # `sample_size` set to 100000.
  - name: OpenOrca-100k
    namespace: lilac
    source:
      source_name: huggingface
      dataset_name: Open-Orca/OpenOrca
      sample_size: 100000
    # gte-small embeddings for the `question` and `response` fields.
    embeddings:
      - embedding: gte-small
        path: question
      - embedding: gte-small
        path: response
    settings:
      tags: [machine-learning]
      ui:
        media_paths: [question, response]
  # cognitivecomputations/dolphin (config `flan1m-alpaca-uncensored`), loaded
  # through the `huggingface` source. No `embeddings` are listed for it.
  # NOTE(review): `tags` sits directly on the dataset entry here, while the
  # other dataset entries in this file nest it under `settings` — confirm
  # that the consumer honours both locations.
  - name: dolphin
    namespace: lilac
    tags: [datasets]
    source:
      source_name: huggingface
      dataset_name: cognitivecomputations/dolphin
      config_name: flan1m-alpaca-uncensored
    settings:
      ui:
        markdown_paths: []
        media_paths:
          - instruction
          - input
          - output
          - [input__cluster, text]
# NOTE(review): presumably enables the hosted "garden" compute backend —
# confirm against the consumer's config schema.
use_garden: true

# Signals, referenced by name.
# NOTE(review): presumably applied to every dataset listed in this file —
# confirm.
signals:
  - signal_name: text_statistics
  - signal_name: lang_detection

# Embedding names for the concept-model cache. Only gte-small is used by the
# dataset entries in this file.
concept_model_cache_embeddings: [gte-small, gte-base, sbert, openai, cohere]
# Cluster definitions. Each entry targets one dataset by namespace and name
# and selects its input either with `input_path` or with `input_selector`.
#
# Paths are plain YAML sequences, matching the `path` and `media_paths`
# values used by the dataset entries in this file. The PyYAML-specific
# `!!python/tuple` tag is avoided because safe loaders reject it.
# NOTE(review): assumes the consumer accepts a list wherever it previously
# received a tuple — confirm against the config schema.
clusters:
  - dataset_namespace: lilac
    dataset_name: Capybara
    input_path:
      - conversation
      - '*'
      - input
  - dataset_namespace: lilac
    dataset_name: glaive-code-assistant
    input_path:
      - question
  - dataset_namespace: lilac
    dataset_name: open-assistant-conversations-2
    input_path:
      - text
  # This entry uses a format-aware selector instead of a field path and names
  # its output path explicitly.
  - dataset_namespace: lilac
    dataset_name: lmsys-chat-1m
    input_selector:
      format: openai_json
      selector: user
    output_path:
      - conversation__clusters
  - dataset_namespace: lilac
    dataset_name: databricks-dolly-15k-curated-en
    input_path:
      - original-instruction
  - dataset_namespace: lilac
    dataset_name: OpenOrca-100k
    input_path:
      - question
  - dataset_namespace: lilac
    dataset_name: dolphin
    input_path:
      - input