# Dataset, signal, and cluster configuration; every dataset source below is Hugging Face.
# Datasets to load. Every entry in this list uses source_name: huggingface.
datasets:
  # Capybara (LDJnr/Capybara): gte-small embeddings on conversation.*.input
  # and conversation.*.output; the same two paths are the UI media paths.
  - namespace: lilac
    name: Capybara
    source:
      source_name: huggingface
      dataset_name: LDJnr/Capybara
    embeddings:
      - embedding: gte-small
        path: [conversation, '*', input]
      - embedding: gte-small
        path: [conversation, '*', output]
    settings:
      tags: [datasets]
      ui:
        media_paths:
          - [conversation, '*', input]
          - [conversation, '*', output]
  # glaive-code-assistant (glaiveai/glaive-code-assistant): gte-small
  # embeddings on question and answer; both are also UI media paths.
  - namespace: lilac
    name: glaive-code-assistant
    source:
      source_name: huggingface
      dataset_name: glaiveai/glaive-code-assistant
    embeddings:
      - embedding: gte-small
        path: question
      - embedding: gte-small
        path: answer
    settings:
      tags: [datasets]
      ui:
        media_paths: [question, answer]
  # glaive-function-calling-v2 (lilacai/glaive-function-calling-v2-sharegpt):
  # a single gte-small embedding on conversations.*.value.
  - namespace: lilac
    name: glaive-function-calling-v2
    source:
      source_name: huggingface
      dataset_name: lilacai/glaive-function-calling-v2-sharegpt
    embeddings:
      - embedding: gte-small
        path: [conversations, '*', value]
    settings:
      tags: [datasets]
      ui:
        media_paths:
          - [conversations, '*', value]
  # open-assistant-conversations-2 (OpenAssistant/oasst2): gte-small embedding
  # on the text field, which is also the only UI media path.
  - namespace: lilac
    name: open-assistant-conversations-2
    source:
      source_name: huggingface
      dataset_name: OpenAssistant/oasst2
    embeddings:
      - embedding: gte-small
        path: text
    settings:
      tags: [datasets]
      ui:
        media_paths: [text]
  # lmsys-chat-1m (lmsys/lmsys-chat-1m): gte-small embedding on
  # conversation.*.content. This is the only entry tagged "logs".
  - namespace: lilac
    name: lmsys-chat-1m
    source:
      source_name: huggingface
      dataset_name: lmsys/lmsys-chat-1m
    embeddings:
      - embedding: gte-small
        path: [conversation, '*', content]
    settings:
      tags: [logs]
      ui:
        media_paths:
          - [conversation, '*', content]
  # OpenOrca (Open-Orca/OpenOrca): only the question field is embedded;
  # question and response are both UI media paths.
  - namespace: lilac
    name: OpenOrca
    source:
      source_name: huggingface
      dataset_name: Open-Orca/OpenOrca
    embeddings:
      - embedding: gte-small
        path: question
    settings:
      tags: [datasets]
      ui:
        media_paths: [question, response]
  # SlimOrca (Open-Orca/SlimOrca): gte-small embedding on
  # conversations.*.value, which is also the UI media path.
  - namespace: lilac
    name: SlimOrca
    source:
      source_name: huggingface
      dataset_name: Open-Orca/SlimOrca
    embeddings:
      - embedding: gte-small
        path: [conversations, '*', value]
    settings:
      tags: [datasets]
      ui:
        media_paths:
          - [conversations, '*', value]
  # UltraChat-200k (HuggingFaceH4/ultrachat_200k): no embeddings are
  # configured for this entry; messages.*.content is the UI media path.
  - namespace: lilac
    name: UltraChat-200k
    source:
      source_name: huggingface
      dataset_name: HuggingFaceH4/ultrachat_200k
    settings:
      tags: [datasets]
      ui:
        media_paths:
          - [messages, '*', content]
  # roblox_luau_corpus (Roblox/luau_corpus): gte-small embeddings on prompt
  # and completion; both are also UI media paths.
  - namespace: lilac
    name: roblox_luau_corpus
    source:
      source_name: huggingface
      dataset_name: Roblox/luau_corpus
    embeddings:
      - embedding: gte-small
        path: prompt
      - embedding: gte-small
        path: completion
    settings:
      tags: [datasets]
      ui:
        media_paths: [prompt, completion]
  # hncomments-1m (OpenPipe/hacker-news): the source sets sample_size to
  # 1000000; gte-small embedding on the text field.
  - namespace: lilac
    name: hncomments-1m
    source:
      source_name: huggingface
      dataset_name: OpenPipe/hacker-news
      sample_size: 1000000
    embeddings:
      - embedding: gte-small
        path: text
    settings:
      tags: [datasets]
      ui:
        media_paths: [text]
  # MMLU (cais/mmlu, config "all"): eval entry with no embeddings configured.
  - namespace: lilac
    name: MMLU
    source:
      source_name: huggingface
      dataset_name: cais/mmlu
      config_name: all
    settings:
      tags: [eval]
      ui:
        media_paths:
          - question
          - [choices, '*']
          - answer
  # ARC-Easy (allenai/ai2_arc, config "ARC-Easy"): eval entry with no
  # embeddings configured.
  - namespace: lilac
    name: ARC-Easy
    source:
      source_name: huggingface
      dataset_name: allenai/ai2_arc
      config_name: ARC-Easy
    settings:
      tags: [eval]
      ui:
        media_paths:
          - question
          - [choices, text, '*']
          - answerKey
  # ARC-Challenge (allenai/ai2_arc, config "ARC-Challenge"): same media paths
  # as the ARC-Easy entry; no embeddings configured.
  - namespace: lilac
    name: ARC-Challenge
    source:
      source_name: huggingface
      dataset_name: allenai/ai2_arc
      config_name: ARC-Challenge
    settings:
      tags: [eval]
      ui:
        media_paths:
          - question
          - [choices, text, '*']
          - answerKey
  # HellaSwag (Rowan/hellaswag): eval entry with no embeddings configured.
  - namespace: lilac
    name: HellaSwag
    source:
      source_name: huggingface
      dataset_name: Rowan/hellaswag
    settings:
      tags: [eval]
      ui:
        media_paths:
          - ctx
          - ctx_a
          - ctx_b
          - [endings, '*']
  # HumanEval (openai_humaneval): eval entry with no embeddings configured.
  - namespace: lilac
    name: HumanEval
    source:
      source_name: huggingface
      dataset_name: openai_humaneval
    settings:
      tags: [eval]
      ui:
        media_paths: [prompt, canonical_solution, test]
  # mbpp (mbpp): eval entry with no embeddings configured.
  - namespace: lilac
    name: mbpp
    source:
      source_name: huggingface
      dataset_name: mbpp
    settings:
      tags: [eval]
      ui:
        media_paths: [code, text]
  # TruthfulQA-MultipleChoice (truthful_qa, config "multiple_choice"): eval
  # entry; media paths cover the question and both target choice lists.
  - namespace: lilac
    name: TruthfulQA-MultipleChoice
    source:
      source_name: huggingface
      dataset_name: truthful_qa
      config_name: multiple_choice
    settings:
      tags: [eval]
      ui:
        media_paths:
          - question
          - [mc1_targets, choices, '*']
          - [mc2_targets, choices, '*']
  # TruthfulQA-Generation (truthful_qa, config "generation"): eval entry with
  # no embeddings configured.
  - namespace: lilac
    name: TruthfulQA-Generation
    source:
      source_name: huggingface
      dataset_name: truthful_qa
      config_name: generation
    settings:
      tags: [eval]
      ui:
        media_paths:
          - question
          - [correct_answers, '*']
          - [incorrect_answers, '*']
          - source
  # GSM8K-main (gsm8k, config "main"): eval entry with no embeddings
  # configured.
  - namespace: lilac
    name: GSM8K-main
    source:
      source_name: huggingface
      dataset_name: gsm8k
      config_name: main
    settings:
      tags: [eval]
      ui:
        media_paths: [question, answer]
  # GSM8K-socratic (gsm8k, config "socratic"): same media paths as the
  # GSM8K-main entry; no embeddings configured.
  - namespace: lilac
    name: GSM8K-socratic
    source:
      source_name: huggingface
      dataset_name: gsm8k
      config_name: socratic
    settings:
      tags: [eval]
      ui:
        media_paths: [question, answer]
  # WinoGrande (winogrande, config "winogrande_xl"): eval entry with no
  # embeddings configured.
  - namespace: lilac
    name: WinoGrande
    source:
      source_name: huggingface
      dataset_name: winogrande
      config_name: winogrande_xl
    settings:
      tags: [eval]
      ui:
        media_paths: [sentence, option1, option2, answer]
  # databricks-dolly-15k-curated-en (argilla/databricks-dolly-15k-curated-en):
  # gte-small embeddings on the three original-* fields only; the new-*
  # fields appear as UI media paths but are not embedded.
  - namespace: lilac
    name: databricks-dolly-15k-curated-en
    source:
      source_name: huggingface
      dataset_name: argilla/databricks-dolly-15k-curated-en
    embeddings:
      - embedding: gte-small
        path: original-instruction
      - embedding: gte-small
        path: original-context
      - embedding: gte-small
        path: original-response
    settings:
      tags: [datasets]
      ui:
        media_paths:
          - original-instruction
          - original-context
          - original-response
          - [new-instruction, value, '*']
          - [new-context, value, '*']
          - [new-response, value, '*']
  # mosaic-instruct-v3 (mosaicml/instruct-v3): only the prompt field is
  # embedded; prompt and response are both UI media paths.
  - namespace: lilac
    name: mosaic-instruct-v3
    source:
      source_name: huggingface
      dataset_name: mosaicml/instruct-v3
    embeddings:
      - embedding: gte-small
        path: prompt
    settings:
      tags: [datasets]
      ui:
        media_paths: [prompt, response]
  # dolphin (cognitivecomputations/dolphin, config "flan1m-alpaca-uncensored"):
  # only the instruction field is embedded.
  # NOTE(review): the clusters section of this file targets the `input` field
  # for this dataset, not `instruction` — confirm that is intentional.
  - namespace: lilac
    name: dolphin
    source:
      source_name: huggingface
      dataset_name: cognitivecomputations/dolphin
      config_name: flan1m-alpaca-uncensored
    embeddings:
      - embedding: gte-small
        path: instruction
    settings:
      tags: [datasets]
      ui:
        media_paths: [instruction, input, output]
# NOTE(review): the effect of use_garden is defined by the consuming
# application and is not visible in this file — confirm before changing.
use_garden: true
# Top-level signals, referenced by signal name.
signals:
  - signal_name: text_statistics
  - signal_name: lang_detection
# Embedding names listed for the concept-model cache.
concept_model_cache_embeddings: [gte-small, gte-base, sbert, openai, cohere]
# Clustering jobs, one per (dataset, input) pair.
#
# Each entry names its dataset and then gives either:
#   * input_path     - the field path to cluster, or
#   * input_selector - a conversation format plus a selector (e.g. human/user),
#                      together with an explicit output_path for the result.
#
# Paths are plain YAML sequences with no language-specific tags, so the file
# can be read by safe YAML loaders and matches how paths are written under
# `datasets`.
clusters:
  - dataset_namespace: lilac
    dataset_name: Capybara
    input_path: [conversation, '*', input]
  - dataset_namespace: lilac
    dataset_name: glaive-code-assistant
    input_path: [question]
  # NOTE(review): this output_path uses a single underscore
  # (conversation_clusters) while the other selector-based entries use a
  # double underscore (conversation__clusters, messages__clusters). Left
  # unchanged because renaming moves where results are written — confirm.
  - dataset_namespace: lilac
    dataset_name: glaive-function-calling-v2
    input_selector:
      format: sharegpt
      selector: human
    output_path: [conversation_clusters]
  - dataset_namespace: lilac
    dataset_name: open-assistant-conversations-2
    input_path: [text]
  - dataset_namespace: lilac
    dataset_name: lmsys-chat-1m
    input_selector:
      format: openai_conversation_json
      selector: user
    output_path: [conversation__clusters]
  - dataset_namespace: lilac
    dataset_name: OpenOrca
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: SlimOrca
    input_selector:
      format: sharegpt
      selector: human
    output_path: [conversation__clusters]
  - dataset_namespace: lilac
    dataset_name: databricks-dolly-15k-curated-en
    input_path: [original-instruction]
  - dataset_namespace: lilac
    dataset_name: mosaic-instruct-v3
    input_path: [prompt]
  - dataset_namespace: lilac
    dataset_name: dolphin
    input_path: [input]
  - dataset_namespace: lilac
    dataset_name: UltraChat-200k
    input_selector:
      format: openai_json
      selector: user
    output_path: [messages__clusters]
  - dataset_namespace: lilac
    dataset_name: hncomments-1m
    input_path: [text]
  # roblox_luau_corpus is listed twice: once for prompt, once for completion.
  - dataset_namespace: lilac
    dataset_name: roblox_luau_corpus
    input_path: [prompt]
  - dataset_namespace: lilac
    dataset_name: roblox_luau_corpus
    input_path: [completion]
  - dataset_namespace: lilac
    dataset_name: MMLU
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: ARC-Easy
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: ARC-Challenge
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: HellaSwag
    input_path: [ctx]
  - dataset_namespace: lilac
    dataset_name: HumanEval
    input_path: [prompt]
  - dataset_namespace: lilac
    dataset_name: mbpp
    input_path: [text]
  - dataset_namespace: lilac
    dataset_name: TruthfulQA-Generation
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: TruthfulQA-MultipleChoice
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: GSM8K-main
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: GSM8K-socratic
    input_path: [question]
  - dataset_namespace: lilac
    dataset_name: WinoGrande
    input_path: [sentence]