# Lilac project config.
# See https://lilacml.com/api_reference/index.html#lilac.Config for details.
#
# Each entry under `datasets` declares a source to load, the embeddings and
# signals to compute per path, and the UI settings for that dataset.
# A path is either a single field name or a list of path segments; '*' is
# quoted so YAML does not read it as an alias indicator.

datasets:
  # HuggingFace `glue` dataset, `ax` config.
  - namespace: local
    name: glue
    source:
      dataset_name: glue
      config_name: ax
      source_name: huggingface
    embeddings:
      - path: premise
        embedding: gte-small
      - path: premise
        embedding: gte-base
      - path: hypothesis
        embedding: gte-small
    signals:
      - path: premise
        signal:
          signal_name: pii
      - path: hypothesis
        signal:
          signal_name: pii
      - path: premise
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - premise
        markdown_paths: []

  # Same HuggingFace source as `glue`, registered under a second name.
  - namespace: local
    name: glue_ax
    source:
      dataset_name: glue
      config_name: ax
      source_name: huggingface
    embeddings:
      - path: hypothesis
        embedding: gte-small
    signals:
      - path: premise
        signal:
          signal_name: text_statistics
      - path: premise
        signal:
          signal_name: pii
      - path: premise
        signal:
          signal_name: near_dup
      # NOTE(review): namespace and concept_name are empty strings, so this
      # entry names no concept. Kept as-is; confirm whether it should be
      # removed.
      - path: hypothesis
        signal:
          embedding: gte-small
          namespace: ''
          concept_name: ''
          signal_name: concept_score
      - path: hypothesis
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: hypothesis
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
    settings:
      ui:
        media_paths:
          - hypothesis
        markdown_paths: []

  # HuggingFace `imdb`, no embeddings or signals configured.
  - namespace: local
    name: imdb3
    source:
      dataset_name: imdb
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  # HuggingFace `imdb` with an embedding and two signals on `text`.
  - namespace: local
    name: imdb
    source:
      dataset_name: imdb
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    signals:
      - path: text
        signal:
          signal_name: pii
      - path: text
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  # HuggingFace `imdb`, no embeddings or signals configured.
  - namespace: local
    name: imdb2
    source:
      dataset_name: imdb
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  # HuggingFace `Open-Orca/OpenOrca` with sample_size 100000.
  - namespace: lilac
    name: OpenOrca-100k
    source:
      dataset_name: Open-Orca/OpenOrca
      sample_size: 100000
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
      - path: response
        embedding: gte-small
    signals:
      # Signals on `question`.
      - path: question
        signal:
          signal_name: near_dup
      - path: question
        signal:
          signal_name: pii
      - path: question
        signal:
          signal_name: lang_detection
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: question
        signal:
          signal_name: text_statistics
      # Signals on `response`.
      - path: response
        signal:
          signal_name: near_dup
      - path: response
        signal:
          signal_name: pii
      - path: response
        signal:
          signal_name: lang_detection
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: response
        signal:
          signal_name: text_statistics
      # Signals on `system_prompt`.
      - path: system_prompt
        signal:
          signal_name: pii
    settings:
      ui:
        media_paths:
          - question
          - response
        markdown_paths: []

  # CSV file loaded from a gs:// path.
  - namespace: local
    name: the_movies_dataset
    source:
      filepaths:
        - gs://lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv
      names: []
      source_name: csv
    settings:
      ui:
        media_paths:
          - overview
        markdown_paths: []

  # Parquet file loaded from a gs:// path.
  - namespace: local
    name: glue_ax_parquet
    source:
      filepaths:
        - gs://lilac-data/datasets/glue_ax_parquet/glue_ax.parquet
      source_name: parquet
    settings:
      ui:
        media_paths:
          - premise
        markdown_paths: []

  # HuggingFace `cais/mmlu`, `professional_law` config.
  - namespace: lilac
    name: mmlu_professional_law
    source:
      dataset_name: cais/mmlu
      config_name: professional_law
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
      - path: [choices, '*']
        embedding: gte-small
    signals:
      # Signals on `question`.
      - path: question
        signal:
          signal_name: near_dup
      - path: question
        signal:
          signal_name: pii
      - path: question
        signal:
          signal_name: lang_detection
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: question
        signal:
          signal_name: text_statistics
      # Signals on the path choices / '*'.
      - path: [choices, '*']
        signal:
          signal_name: near_dup
      - path: [choices, '*']
        signal:
          signal_name: pii
      - path: [choices, '*']
        signal:
          signal_name: lang_detection
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: [choices, '*']
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - question
          - [choices, '*']
        markdown_paths: []
      preferred_embedding: gte-small

  # HuggingFace `deepset/prompt-injections`.
  - namespace: local
    name: deepset-prompt-inj
    source:
      dataset_name: deepset/prompt-injections
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  # HuggingFace `JasperLS/prompt-injections`.
  - namespace: local
    name: jasper-prompt-inj
    source:
      dataset_name: JasperLS/prompt-injections
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  # HuggingFace `sam-mosaic/chat-v2`.
  - namespace: local
    name: mosaic-chat-v2
    source:
      dataset_name: sam-mosaic/chat-v2
      source_name: huggingface
    embeddings:
      - path: prompt
        embedding: gte-small
      - path: response
        embedding: gte-small
    signals:
      # Signals on `prompt`.
      - path: prompt
        signal:
          signal_name: near_dup
      - path: prompt
        signal:
          signal_name: pii
      - path: prompt
        signal:
          signal_name: lang_detection
      - path: prompt
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: prompt
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: prompt
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: prompt
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: prompt
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: prompt
        signal:
          signal_name: text_statistics
      # Signals on `response`.
      - path: response
        signal:
          signal_name: near_dup
      - path: response
        signal:
          signal_name: pii
      - path: response
        signal:
          signal_name: lang_detection
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: response
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: response
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - prompt
          - response
        markdown_paths: []
      preferred_embedding: gte-small

  # HuggingFace `argilla/databricks-dolly-15k-curated-en`.
  - namespace: local
    name: databricks-dolly-15k-curated-en
    source:
      dataset_name: argilla/databricks-dolly-15k-curated-en
      source_name: huggingface
    embeddings:
      - path: original-context
        embedding: gte-small
      - path: [new-context, value, '*']
        embedding: gte-small
      - path: original-instruction
        embedding: gte-small
    signals:
      # Signals on `original-instruction`.
      - path: original-instruction
        signal:
          signal_name: near_dup
      - path: original-instruction
        signal:
          signal_name: pii
      - path: original-instruction
        signal:
          signal_name: lang_detection
      - path: original-instruction
        signal:
          signal_name: text_statistics
      # Signals on `original-context`.
      - path: original-context
        signal:
          signal_name: near_dup
      - path: original-context
        signal:
          signal_name: pii
      - path: original-context
        signal:
          signal_name: lang_detection
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: original-context
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: original-context
        signal:
          signal_name: text_statistics
      # Signals on `original-response`.
      - path: original-response
        signal:
          signal_name: near_dup
      - path: original-response
        signal:
          signal_name: pii
      - path: original-response
        signal:
          signal_name: lang_detection
      - path: original-response
        signal:
          signal_name: text_statistics
      # Signals on the path new-instruction / value / '*'.
      - path: [new-instruction, value, '*']
        signal:
          signal_name: near_dup
      - path: [new-instruction, value, '*']
        signal:
          signal_name: pii
      - path: [new-instruction, value, '*']
        signal:
          signal_name: lang_detection
      - path: [new-instruction, value, '*']
        signal:
          signal_name: text_statistics
      # Signals on the path new-context / value / '*'.
      - path: [new-context, value, '*']
        signal:
          signal_name: near_dup
      - path: [new-context, value, '*']
        signal:
          signal_name: pii
      - path: [new-context, value, '*']
        signal:
          signal_name: lang_detection
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: [new-context, value, '*']
        signal:
          signal_name: text_statistics
      # Signals on the path new-response / value / '*'.
      - path: [new-response, value, '*']
        signal:
          signal_name: near_dup
      - path: [new-response, value, '*']
        signal:
          signal_name: pii
      - path: [new-response, value, '*']
        signal:
          signal_name: lang_detection
      - path: [new-response, value, '*']
        signal:
          signal_name: text_statistics
      # Additional signal on `original-instruction`.
      - path: original-instruction
        signal:
          signal_name: spacy_ner
    settings:
      ui:
        media_paths:
          - original-instruction
          - original-context
          - original-response
          - [new-instruction, value, '*']
          - [new-context, value, '*']
          - [new-response, value, '*']
        markdown_paths: []
      preferred_embedding: gte-small

  # HuggingFace `OpenAssistant/oasst1`.
  # NOTE(review): the name is spelled "asssistant" (three s's). It is left
  # unchanged because it is the dataset's identifier; confirm before renaming.
  - namespace: local
    name: open-asssistant-conversations
    source:
      dataset_name: OpenAssistant/oasst1
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    signals:
      - path: text
        signal:
          signal_name: near_dup
      - path: text
        signal:
          signal_name: pii
      - path: text
        signal:
          signal_name: lang_detection
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: text
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []
      preferred_embedding: gte-small

  # HuggingFace `EleutherAI/pile`, `enron_emails` config, sample_size 100000.
  - namespace: local
    name: enron-emails
    source:
      dataset_name: EleutherAI/pile
      config_name: enron_emails
      sample_size: 100000
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    signals:
      - path: text
        signal:
          signal_name: near_dup
      - path: text
        signal:
          signal_name: pii
      - path: text
        signal:
          signal_name: lang_detection
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: positive-sentiment
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: toxicity
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: question
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: legal-termination
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: source-code
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: negative-sentiment
          signal_name: concept_score
      - path: text
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: profanity
          signal_name: concept_score
      - path: text
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []
      preferred_embedding: gte-small