---
# Dataset, signal and cluster configuration.
#
# Layout:
#   datasets  - one entry per dataset: where it is loaded from, which
#               fields get a gte-small embedding, and UI settings/tags.
#   signals   - signals listed once at the top level.
#   clusters  - one entry per (dataset, field) to cluster.
#
# Field paths are either a single field name or a sequence of path
# components; '*' is quoted because a bare * starts a YAML alias.
# Paths are written as plain sequences (no language-specific tags) so
# the file loads with a safe YAML loader.

datasets:
  # --- Tagged "datasets" / "logs" -------------------------------------
  - namespace: lilac
    name: Capybara
    source:
      dataset_name: LDJnr/Capybara
      source_name: huggingface
    embeddings:
      - path: [conversation, '*', input]
        embedding: gte-small
      - path: [conversation, '*', output]
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - [conversation, '*', input]
          - [conversation, '*', output]
      tags:
        - datasets

  - namespace: lilac
    name: glaive-code-assistant
    source:
      dataset_name: glaiveai/glaive-code-assistant
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
      - path: answer
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - question
          - answer
      tags:
        - datasets

  - namespace: lilac
    name: glaive-function-calling-v2
    source:
      dataset_name: lilacai/glaive-function-calling-v2-sharegpt
      source_name: huggingface
    embeddings:
      - path: [conversations, '*', value]
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - [conversations, '*', value]
      tags:
        - datasets

  - namespace: lilac
    name: open-assistant-conversations-2
    source:
      dataset_name: OpenAssistant/oasst2
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - text
      tags:
        - datasets

  - namespace: lilac
    name: lmsys-chat-1m
    source:
      dataset_name: lmsys/lmsys-chat-1m
      source_name: huggingface
    embeddings:
      - path: [conversation, '*', content]
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - [conversation, '*', content]
      tags:
        - logs

  - namespace: lilac
    name: OpenOrca
    source:
      dataset_name: Open-Orca/OpenOrca
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - question
          - response
      tags:
        - datasets

  - namespace: lilac
    name: SlimOrca
    source:
      dataset_name: Open-Orca/SlimOrca
      source_name: huggingface
    embeddings:
      - path: [conversations, '*', value]
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - [conversations, '*', value]
      tags:
        - datasets

  # No embeddings are configured for this dataset.
  - namespace: lilac
    name: UltraChat-200k
    source:
      dataset_name: HuggingFaceH4/ultrachat_200k
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - [messages, '*', content]
      tags:
        - datasets

  - namespace: lilac
    name: roblox_luau_corpus
    source:
      dataset_name: Roblox/luau_corpus
      source_name: huggingface
    embeddings:
      - path: prompt
        embedding: gte-small
      - path: completion
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - prompt
          - completion
      tags:
        - datasets

  # The only entry that sets sample_size on its source.
  - namespace: lilac
    name: hncomments-1m
    source:
      dataset_name: OpenPipe/hacker-news
      sample_size: 1000000
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - text
      tags:
        - datasets

  # --- Tagged "eval" (none of these configure embeddings) -------------
  - namespace: lilac
    name: MMLU
    source:
      dataset_name: cais/mmlu
      config_name: all
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - [choices, '*']
          - answer
      tags:
        - eval

  - namespace: lilac
    name: ARC-Easy
    source:
      dataset_name: allenai/ai2_arc
      config_name: ARC-Easy
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - [choices, text, '*']
          - answerKey
      tags:
        - eval

  - namespace: lilac
    name: ARC-Challenge
    source:
      dataset_name: allenai/ai2_arc
      config_name: ARC-Challenge
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - [choices, text, '*']
          - answerKey
      tags:
        - eval

  - namespace: lilac
    name: HellaSwag
    source:
      dataset_name: Rowan/hellaswag
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - ctx
          - ctx_a
          - ctx_b
          - [endings, '*']
      tags:
        - eval

  - namespace: lilac
    name: HumanEval
    source:
      dataset_name: openai_humaneval
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - prompt
          - canonical_solution
          - test
      tags:
        - eval

  - namespace: lilac
    name: mbpp
    source:
      dataset_name: mbpp
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - code
          - text
      tags:
        - eval

  - namespace: lilac
    name: TruthfulQA-MultipleChoice
    source:
      dataset_name: truthful_qa
      config_name: multiple_choice
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - [mc1_targets, choices, '*']
          - [mc2_targets, choices, '*']
      tags:
        - eval

  - namespace: lilac
    name: TruthfulQA-Generation
    source:
      dataset_name: truthful_qa
      config_name: generation
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - [correct_answers, '*']
          - [incorrect_answers, '*']
          - source
      tags:
        - eval

  - namespace: lilac
    name: GSM8K-main
    source:
      dataset_name: gsm8k
      config_name: main
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - answer
      tags:
        - eval

  - namespace: lilac
    name: GSM8K-socratic
    source:
      dataset_name: gsm8k
      config_name: socratic
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - answer
      tags:
        - eval

  - namespace: lilac
    name: WinoGrande
    source:
      dataset_name: winogrande
      config_name: winogrande_xl
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - sentence
          - option1
          - option2
          - answer
      tags:
        - eval

  # --- Tagged "datasets" (continued) ----------------------------------
  - namespace: lilac
    name: databricks-dolly-15k-curated-en
    source:
      dataset_name: argilla/databricks-dolly-15k-curated-en
      source_name: huggingface
    embeddings:
      - path: original-instruction
        embedding: gte-small
      - path: original-context
        embedding: gte-small
      - path: original-response
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - original-instruction
          - original-context
          - original-response
          - [new-instruction, value, '*']
          - [new-context, value, '*']
          - [new-response, value, '*']
      tags:
        - datasets

  - namespace: lilac
    name: mosaic-instruct-v3
    source:
      dataset_name: mosaicml/instruct-v3
      source_name: huggingface
    embeddings:
      - path: prompt
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - prompt
          - response
      tags:
        - datasets

  - namespace: lilac
    name: dolphin
    source:
      dataset_name: cognitivecomputations/dolphin
      config_name: flan1m-alpaca-uncensored
      source_name: huggingface
    embeddings:
      - path: instruction
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - instruction
          - input
          - output
      tags:
        - datasets

use_garden: true

signals:
  - signal_name: text_statistics
  - signal_name: lang_detection

concept_model_cache_embeddings:
  - gte-small
  - gte-base
  - sbert
  - openai
  - cohere

# Each cluster entry either names an input_path directly, or uses an
# input_selector (conversation format + selector) together with an
# explicit output_path.
clusters:
  - dataset_namespace: lilac
    dataset_name: Capybara
    input_path: [conversation, '*', input]

  - dataset_namespace: lilac
    dataset_name: glaive-code-assistant
    input_path: [question]

  # NOTE(review): output_path here uses a single underscore
  # (conversation_clusters) while the other selector-based entries use a
  # double underscore (conversation__clusters, messages__clusters).
  # Left unchanged because renaming would change where output is
  # written - confirm whether this is intentional.
  - dataset_namespace: lilac
    dataset_name: glaive-function-calling-v2
    input_selector:
      format: sharegpt
      selector: human
    output_path: [conversation_clusters]

  - dataset_namespace: lilac
    dataset_name: open-assistant-conversations-2
    input_path: [text]

  - dataset_namespace: lilac
    dataset_name: lmsys-chat-1m
    input_selector:
      format: openai_conversation_json
      selector: user
    output_path: [conversation__clusters]

  - dataset_namespace: lilac
    dataset_name: OpenOrca
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: SlimOrca
    input_selector:
      format: sharegpt
      selector: human
    output_path: [conversation__clusters]

  - dataset_namespace: lilac
    dataset_name: databricks-dolly-15k-curated-en
    input_path: [original-instruction]

  - dataset_namespace: lilac
    dataset_name: mosaic-instruct-v3
    input_path: [prompt]

  # NOTE(review): clusters on `input`, whereas the dolphin dataset entry
  # only lists an embedding for `instruction` - confirm this is the
  # intended field.
  - dataset_namespace: lilac
    dataset_name: dolphin
    input_path: [input]

  - dataset_namespace: lilac
    dataset_name: UltraChat-200k
    input_selector:
      format: openai_json
      selector: user
    output_path: [messages__clusters]

  - dataset_namespace: lilac
    dataset_name: hncomments-1m
    input_path: [text]

  # roblox_luau_corpus is clustered twice: once per text field.
  - dataset_namespace: lilac
    dataset_name: roblox_luau_corpus
    input_path: [prompt]

  - dataset_namespace: lilac
    dataset_name: roblox_luau_corpus
    input_path: [completion]

  - dataset_namespace: lilac
    dataset_name: MMLU
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: ARC-Easy
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: ARC-Challenge
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: HellaSwag
    input_path: [ctx]

  - dataset_namespace: lilac
    dataset_name: HumanEval
    input_path: [prompt]

  - dataset_namespace: lilac
    dataset_name: mbpp
    input_path: [text]

  - dataset_namespace: lilac
    dataset_name: TruthfulQA-Generation
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: TruthfulQA-MultipleChoice
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: GSM8K-main
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: GSM8K-socratic
    input_path: [question]

  - dataset_namespace: lilac
    dataset_name: WinoGrande
    input_path: [sentence]