nsthorat committed on
Commit
80dadd4
1 Parent(s): dbe8c69
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50):
  1. .dockerignore +10 -0
  2. .env +40 -0
  3. .env.demo +4 -0
  4. .gitattributes +0 -35
  5. .gitignore +5 -0
  6. Dockerfile +29 -0
  7. LICENSE +201 -0
  8. README.md +6 -9
  9. lilac/.gitignore +1 -0
  10. lilac/__init__.py +33 -0
  11. lilac/auth.py +87 -0
  12. lilac/batch_utils.py +92 -0
  13. lilac/cli.py +39 -0
  14. lilac/concepts/__init__.py +0 -0
  15. lilac/concepts/concept.py +330 -0
  16. lilac/concepts/db_concept.py +520 -0
  17. lilac/config.py +80 -0
  18. lilac/conftest.py +28 -0
  19. lilac/data/__init__.py +9 -0
  20. lilac/data/dataset.py +485 -0
  21. lilac/data/dataset_duckdb.py +1717 -0
  22. lilac/data/dataset_test_utils.py +127 -0
  23. lilac/data/dataset_utils.py +308 -0
  24. lilac/data/duckdb_utils.py +25 -0
  25. lilac/data_loader.py +110 -0
  26. lilac/db_manager.py +42 -0
  27. lilac/embeddings/__init__.py +0 -0
  28. lilac/embeddings/cohere.py +59 -0
  29. lilac/embeddings/default_vector_stores.py +10 -0
  30. lilac/embeddings/embedding.py +110 -0
  31. lilac/embeddings/gte.py +63 -0
  32. lilac/embeddings/openai.py +68 -0
  33. lilac/embeddings/palm.py +62 -0
  34. lilac/embeddings/sbert.py +38 -0
  35. lilac/embeddings/transformer_utils.py +35 -0
  36. lilac/embeddings/vector_store.py +200 -0
  37. lilac/embeddings/vector_store_hnsw.py +106 -0
  38. lilac/embeddings/vector_store_numpy.py +92 -0
  39. lilac/env.py +63 -0
  40. lilac/load.py +214 -0
  41. lilac/make_openapi.py +29 -0
  42. lilac/parquet_writer.py +70 -0
  43. lilac/router_concept.py +209 -0
  44. lilac/router_data_loader.py +80 -0
  45. lilac/router_dataset.py +303 -0
  46. lilac/router_google_login.py +60 -0
  47. lilac/router_signal.py +105 -0
  48. lilac/router_tasks.py +14 -0
  49. lilac/router_utils.py +54 -0
  50. lilac/schema.py +600 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ # Python
+ **/__pycache__
+ **/*.pyc
+ **/*.pyo
+ **/*.pyd
+ # Ignore unit tests.
+ **/*_test.py
+
+ # Mac OS.
+ .DS_Store
.env ADDED
@@ -0,0 +1,40 @@
+ # To overwrite these variables, create a .env.local file.
+
+ # The path to the directory where the data will be downloaded on the machine.
+ LILAC_DATA_PATH=./data
+
+ # Set to 1 for DuckDB to use views instead of materialized tables (lower memory usage, but slower).
+ DUCKDB_USE_VIEWS=0
+
+ # Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset
+ # signals.
+ # LILAC_AUTH_ENABLED=true
+
+ # Variables that can be set in .env.local
+ #
+ # Get key from https://dashboard.cohere.ai/api-keys
+ # COHERE_API_KEY=
+
+ # GCS_REGION=
+ # GCS_ACCESS_KEY=
+ # GCS_SECRET_KEY=
+
+ # Get key from https://platform.openai.com/account/api-keys
+ # OPENAI_API_KEY=
+ # Get key from https://makersuite.google.com/app/apikey
+ # PALM_API_KEY=
+
+ # HuggingFace demos: machine that uploads to HuggingFace.
+
+ # For authenticating with HuggingFace to deploy to a Space.
+ # HF_USERNAME=
+ # The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
+ # HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
+
+ # For Google login. This is generated from the Google Cloud Console for a web client.
+ # See: https://developers.google.com/identity/protocols/oauth2
+ GOOGLE_CLIENT_ID='279475920249-i8llm8vbos1vj5m1qocir8narb3r0enu.apps.googleusercontent.com'
+ # The client secret of the above client.
+ # GOOGLE_CLIENT_SECRET=
+ # A random string for oauth sessions.
+ # LILAC_OAUTH_SECRET_KEY=
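The .env file above only declares defaults; machine-specific values are meant to go in .env.local. As a rough sketch of that layering (assuming the python-dotenv package; the env() helper below is hypothetical and only stands in for lilac/env.py, which is added later in this commit):

# Minimal sketch of layered env loading, assuming python-dotenv is installed.
# The real lilac/env.py in this commit may load variables differently.
import os

from dotenv import load_dotenv

# Load the checked-in defaults first, then let .env.local override them.
load_dotenv('.env')
load_dotenv('.env.local', override=True)


def env(name: str, default: str = '') -> str:
  """Return an environment variable, falling back to a default (hypothetical helper)."""
  return os.environ.get(name, default)


print(env('LILAC_DATA_PATH', './data'))
print(env('DUCKDB_USE_VIEWS', '0'))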
.env.demo ADDED
@@ -0,0 +1,4 @@
+ LILAC_DATA_PATH='/data'
+ HF_HOME='/data/.huggingface'
+ TRANSFORMERS_CACHE='/data/.cache'
+ XDG_CACHE_HOME='/data/.cache'
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__/
+ **/*.pyc
+ **/*.pyo
+ **/*.pyd
+ **/*_test.py
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # NOTE: When we upgrade to 3.11 we can use a slimmer docker image which comes with gcc.
+ FROM python:3.9-bullseye
+
+ # Allow statements and log messages to immediately appear in the Knative logs.
+ ENV PYTHONUNBUFFERED True
+
+ # Set the working directory in the container.
+ WORKDIR /server
+
+ # Install the dependencies. This requires exporting requirements.txt from poetry first, which
+ # happens from ./build_docker.sh.
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY .env .
+ COPY .env.demo .
+ COPY LICENSE .
+
+ # Copy python files.
+ COPY /lilac ./lilac/
+
+ # Copy the data files. We use a glob so the docker copy won't fail if the directory doesn't exist.
+ COPY /dat[a] ./data/
+
+ CMD [ \
+   "gunicorn", "lilac.server:app", \
+   "--bind", "0.0.0.0:5432", \
+   "-k", "uvicorn.workers.UvicornWorker" \
+ ]
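The image serves lilac.server:app with gunicorn's UvicornWorker on port 5432. For a local, non-Docker run, an equivalent sketch (assuming uvicorn is installed; this is not the project's documented workflow) is:

# Sketch only: run the same ASGI app locally without gunicorn.
# Assumes the lilac package and uvicorn are installed; the import string
# 'lilac.server:app' matches the gunicorn CMD above.
import uvicorn

if __name__ == '__main__':
  # Bind to the same host/port the Dockerfile uses.
  uvicorn.run('lilac.server:app', host='0.0.0.0', port=5432)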
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 Lilac AI Inc.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,11 +1,8 @@
  ---
- title: Lilac
- emoji: 🌖
- colorFrom: indigo
- colorTo: blue
+ title: Lilac Blueprint
+ emoji: 🌷
+ colorFrom: purple
+ colorTo: purple
  sdk: docker
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ app_port: 5432
+ ---
lilac/.gitignore ADDED
@@ -0,0 +1 @@
+ web/
lilac/__init__.py ADDED
@@ -0,0 +1,33 @@
+ from importlib import metadata
+
+ from .data import * # noqa: F403
+ from .data.dataset_duckdb import DatasetDuckDB
+ from .data_loader import create_dataset
+ from .db_manager import get_dataset, set_default_dataset_cls
+ from .embeddings.default_vector_stores import register_default_vector_stores
+ from .server import start_server, stop_server
+ from .signals import * # noqa: F403
+ from .signals.default_signals import register_default_signals
+ from .sources import * # noqa: F403
+ from .sources.default_sources import register_default_sources
+
+ try:
+   __version__ = metadata.version('lilacai')
+ except metadata.PackageNotFoundError:
+   __version__ = ''
+
+ register_default_sources()
+ register_default_signals()
+ register_default_vector_stores()
+ set_default_dataset_cls(DatasetDuckDB)
+
+ # Avoids polluting the results of dir(__package__).
+ del (metadata, register_default_sources, register_default_signals, set_default_dataset_cls,
+      DatasetDuckDB)
+
+ __all__ = [
+   'start_server',
+   'stop_server',
+   'create_dataset',
+   'get_dataset',
+ ]
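Importing the package registers the default sources, signals and vector stores and exposes a small top-level API. A hedged usage sketch (argument values below are illustrative, not taken from this commit; see lilac/data_loader.py and lilac/db_manager.py for the real create_dataset/get_dataset signatures):

# Sketch of the re-exported top-level API.
import lilac

print(lilac.__version__)  # '' when running from a source checkout rather than an installed wheel.

# Start the web server exported from lilac.server (the same entry point the CLI uses).
lilac.start_server(host='127.0.0.1', port=5432)
# ... interact with the UI, load datasets, etc. ...
lilac.stop_server()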
lilac/auth.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Authentication and ACL configuration."""
2
+
3
+ from typing import Optional
4
+
5
+ from fastapi import Request
6
+ from pydantic import BaseModel, ValidationError
7
+
8
+ from .env import env
9
+
10
+
11
+ class ConceptAuthorizationException(Exception):
12
+ """Authorization exceptions thrown by the concept database."""
13
+ pass
14
+
15
+
16
+ class DatasetUserAccess(BaseModel):
17
+ """User access for datasets."""
18
+ # Whether the user can compute a signal.
19
+ compute_signals: bool
20
+ # Whether the user can delete a dataset.
21
+ delete_dataset: bool
22
+ # Whether the user can delete a signal.
23
+ delete_signals: bool
24
+ # Whether the user can update settings.
25
+ update_settings: bool
26
+
27
+
28
+ class ConceptUserAccess(BaseModel):
29
+ """User access for concepts."""
30
+ # Whether the user can delete any concept (not their own).
31
+ delete_any_concept: bool
32
+
33
+
34
+ class UserAccess(BaseModel):
35
+ """User access."""
36
+ create_dataset: bool
37
+
38
+ # TODO(nsthorat): Make this keyed to each dataset and concept.
39
+ dataset: DatasetUserAccess
40
+ concept: ConceptUserAccess
41
+
42
+
43
+ class UserInfo(BaseModel):
44
+ """User information."""
45
+ id: str
46
+ email: str
47
+ name: str
48
+ given_name: str
49
+ family_name: str
50
+
51
+
52
+ class AuthenticationInfo(BaseModel):
53
+ """Authentication information for the user."""
54
+ user: Optional[UserInfo] = None
55
+ access: UserAccess
56
+ auth_enabled: bool
57
+
58
+
59
+ def get_session_user(request: Request) -> Optional[UserInfo]:
60
+ """Get the user from the session."""
61
+ if not env('LILAC_AUTH_ENABLED'):
62
+ return None
63
+ user_info_dict = request.session.get('user', None)
64
+ if user_info_dict:
65
+ try:
66
+ return UserInfo.parse_obj(user_info_dict)
67
+ except ValidationError:
68
+ return None
69
+ return None
70
+
71
+
72
+ def get_user_access() -> UserAccess:
73
+ """Get the user access."""
74
+ auth_enabled = env('LILAC_AUTH_ENABLED')
75
+ if isinstance(auth_enabled, str):
76
+ auth_enabled = auth_enabled.lower() == 'true'
77
+ if auth_enabled:
78
+ return UserAccess(
79
+ create_dataset=False,
80
+ dataset=DatasetUserAccess(
81
+ compute_signals=False, delete_dataset=False, delete_signals=False, update_settings=False),
82
+ concept=ConceptUserAccess(delete_any_concept=False))
83
+ return UserAccess(
84
+ create_dataset=True,
85
+ dataset=DatasetUserAccess(
86
+ compute_signals=True, delete_dataset=True, delete_signals=True, update_settings=True),
87
+ concept=ConceptUserAccess(delete_any_concept=True))
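When LILAC_AUTH_ENABLED is truthy, get_user_access() returns an all-False ACL (a read-only deployment); otherwise every permission is granted. A small illustrative sketch, assuming env() reflects the process environment at call time (it may instead cache values loaded from .env):

# Illustrative only: shows how the ACL flips with LILAC_AUTH_ENABLED.
import os

from lilac.auth import get_user_access

os.environ['LILAC_AUTH_ENABLED'] = 'true'
print(get_user_access().create_dataset)  # False: everything is read-only.

os.environ['LILAC_AUTH_ENABLED'] = ''
print(get_user_access().create_dataset)  # True: full access when auth is disabled.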
lilac/batch_utils.py ADDED
@@ -0,0 +1,92 @@
+ """Utils for the python server."""
+ import itertools
+ from typing import Any, Callable, Generator, Iterable, Iterator, TypeVar, Union, cast
+
+ from .schema import Item
+ from .utils import chunks, is_primitive
+
+
+ def _deep_flatten(input: Union[Iterator, object],
+                   is_primitive_predicate: Callable[[object], bool]) -> Generator:
+   """Flattens a nested iterable."""
+   if is_primitive_predicate(input):
+     yield input
+   elif isinstance(input, dict):
+     yield input
+   elif is_primitive(input):
+     yield input
+   else:
+     for elem in cast(Iterator, input):
+       yield from _deep_flatten(elem, is_primitive_predicate)
+
+
+ def deep_flatten(input: Union[Iterator, Iterable],
+                  is_primitive_predicate: Callable[[object], bool] = is_primitive) -> Iterator:
+   """Flattens a deeply nested iterator.
+
+   Primitives and dictionaries are not flattened. The user can also provide a predicate to
+   determine what is a primitive.
+   """
+   return _deep_flatten(input, is_primitive_predicate)
+
+
+ def _deep_unflatten(flat_input: Iterator[list[object]], original_input: Union[Iterable, object],
+                     is_primitive_predicate: Callable[[object], bool]) -> Union[list, dict]:
+   """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+   if is_primitive_predicate(original_input):
+     return next(flat_input)
+   else:
+     values: Iterable
+     if isinstance(original_input, dict):
+       values = original_input.values()
+     else:
+       values = cast(Iterable, original_input)
+     return [_deep_unflatten(flat_input, orig_elem, is_primitive_predicate) for orig_elem in values]
+
+
+ def deep_unflatten(flat_input: Union[Iterable, Iterator],
+                    original_input: Union[Iterable, object],
+                    is_primitive_predicate: Callable[[object], bool] = is_primitive) -> list:
+   """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+   return cast(list, _deep_unflatten(iter(flat_input), original_input, is_primitive_predicate))
+
+
+ TFlatten = TypeVar('TFlatten')
+
+
+ def flatten(inputs: Iterable[Iterable[TFlatten]]) -> Iterator[TFlatten]:
+   """Flattens a nested iterator.
+
+   Only supports flattening one level deep.
+   """
+   for input in inputs:
+     yield from input
+
+
+ TUnflatten = TypeVar('TUnflatten')
+
+
+ def unflatten(flat_inputs: Union[Iterable[TUnflatten], Iterator[TUnflatten]],
+               original_inputs: Iterable[Iterable[Any]]) -> Iterator[list[TUnflatten]]:
+   """Unflattens a flattened iterable according to the original iterable's structure."""
+   flat_inputs_iter = iter(flat_inputs)
+   for original_input in original_inputs:
+     yield [next(flat_inputs_iter) for _ in original_input]
+
+
+ TFlatBatchedInput = TypeVar('TFlatBatchedInput')
+ TFlatBatchedOutput = TypeVar('TFlatBatchedOutput')
+
+
+ def flat_batched_compute(input: Iterable[Iterable[TFlatBatchedInput]],
+                          f: Callable[[list[TFlatBatchedInput]], Iterable[TFlatBatchedOutput]],
+                          batch_size: int) -> Iterable[Iterable[TFlatBatchedOutput]]:
+   """Flatten the input, call f in batches, and return the output unflattened."""
+   # Tee the input so we can use it twice for the input and output shapes.
+   input_1, input_2 = itertools.tee(input, 2)
+   batches = chunks(flatten(input_1), batch_size)
+   batched_outputs = flatten((f(batch) for batch in batches))
+   return unflatten(batched_outputs, input_2)
+
+
+ TBatchSpanVectorOutput = TypeVar('TBatchSpanVectorOutput', bound=Item)
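flat_batched_compute flattens ragged per-row inputs, runs a batched function over the flat stream, and reassembles the outputs using the original row lengths. A self-contained sketch of that round trip (chunks is re-implemented here only for illustration; the real helper lives in lilac/utils.py):

# Self-contained sketch of the flatten / batched-compute / unflatten round trip.
from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar('T')


def chunks(items: Iterable[T], size: int) -> Iterator[list[T]]:
  """Yield successive lists of at most `size` items (stand-in for lilac.utils.chunks)."""
  it = iter(items)
  while batch := list(islice(it, size)):
    yield batch


rows = [[1, 2], [], [3, 4, 5]]           # Ragged per-row inputs.
flat = [x for row in rows for x in row]  # flatten(): [1, 2, 3, 4, 5]
doubled = [x * 2 for batch in chunks(flat, 2) for x in batch]  # "Batched" computation.

# unflatten(): reassemble the outputs using the original row lengths.
it = iter(doubled)
print([[next(it) for _ in row] for row in rows])  # [[2, 4], [], [6, 8, 10]]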
lilac/cli.py ADDED
@@ -0,0 +1,39 @@
+ """Lilac CLI."""
+
+ import click
+
+ from . import __version__
+ from .load import load_command as load
+ from .server import start_server
+
+
+ @click.command()
+ @click.option(
+   '--host',
+   help='The host address where the web server will listen.',
+   default='0.0.0.0',
+   type=str)
+ @click.option('--port', help='The port number of the web server.', type=int, default=5432)
+ def start(host: str, port: int) -> None:
+   """Starts the Lilac web server."""
+   start_server(host=host, port=port, open=True)
+
+
+ @click.command()
+ def version() -> None:
+   """Prints the version of Lilac."""
+   print(__version__)
+
+
+ @click.group()
+ def cli() -> None:
+   """Lilac CLI."""
+   pass
+
+
+ cli.add_command(start)
+ cli.add_command(version)
+ cli.add_command(load)
+
+ if __name__ == '__main__':
+   cli()
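Because the CLI is a plain click group, it can also be driven in-process with click's test runner; a short sketch:

# Sketch: invoke the click group in-process instead of from a shell.
from click.testing import CliRunner

from lilac.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ['version'])
print(result.output)  # The installed lilac version, or '' in a source checkout.

# A shell invocation like `lilac start --port 5432` maps to:
# runner.invoke(cli, ['start', '--port', '5432'])  # Would block while the server runs.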
lilac/concepts/__init__.py ADDED
File without changes
lilac/concepts/concept.py ADDED
@@ -0,0 +1,330 @@
1
+ """Defines the concept and the concept models."""
2
+ import dataclasses
3
+ from enum import Enum
4
+ from typing import Callable, Literal, Optional, Union
5
+
6
+ import numpy as np
7
+ from joblib import Parallel, delayed
8
+ from pydantic import BaseModel, validator
9
+ from scipy.interpolate import interp1d
10
+ from sklearn.base import clone
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.metrics import precision_recall_curve, roc_auc_score
13
+ from sklearn.model_selection import KFold
14
+
15
+ from ..embeddings.embedding import get_embed_fn
16
+ from ..schema import SignalInputType
17
+ from ..signals.signal import TextEmbeddingSignal, get_signal_cls
18
+ from ..utils import DebugTimer
19
+
20
+ LOCAL_CONCEPT_NAMESPACE = 'local'
21
+
22
+ # The maximum number of cross-validation models to train.
23
+ MAX_NUM_CROSS_VAL_MODELS = 15
24
+ # The β weight to use for the F-beta score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html
25
+ # β = 0.5 means we value precision 2x as much as recall.
26
+ # β = 2 means we value recall 2x as much as precision.
27
+ F_BETA_WEIGHT = 0.5
28
+
29
+
30
+ class ExampleOrigin(BaseModel):
31
+ """The origin of an example."""
32
+ # The namespace that holds the dataset.
33
+ dataset_namespace: str
34
+
35
+ # The name of the dataset.
36
+ dataset_name: str
37
+
38
+ # The id of row in the dataset that the example was added from.
39
+ dataset_row_id: str
40
+
41
+
42
+ DraftId = Union[Literal['main'], str]
43
+ DRAFT_MAIN = 'main'
44
+
45
+
46
+ class ExampleIn(BaseModel):
47
+ """An example in a concept without the id (used for adding new examples)."""
48
+ label: bool
49
+ text: Optional[str] = None
50
+ img: Optional[bytes] = None
51
+ origin: Optional[ExampleOrigin] = None
52
+ # The name of the draft to put the example in. If None, puts it in the main draft.
53
+ draft: Optional[DraftId] = DRAFT_MAIN
54
+
55
+ @validator('text')
56
+ def parse_text(cls, text: str) -> str:
57
+ """Fixes surrogate errors in text: https://github.com/ijl/orjson/blob/master/README.md#str ."""
58
+ return text.encode('utf-8', 'replace').decode('utf-8')
59
+
60
+
61
+ class Example(ExampleIn):
62
+ """A single example in a concept used for training a concept model."""
63
+ id: str
64
+
65
+
66
+ class Concept(BaseModel):
67
+ """A concept is a collection of examples."""
68
+ # The namespace of the concept.
69
+ namespace: str
70
+ # The name of the concept.
71
+ concept_name: str
72
+ # The type of the data format that this concept represents.
73
+ type: SignalInputType
74
+ data: dict[str, Example]
75
+ version: int = 0
76
+
77
+ description: Optional[str] = None
78
+
79
+ def drafts(self) -> list[DraftId]:
80
+ """Gets all the drafts for the concept."""
81
+ drafts: set[DraftId] = set([DRAFT_MAIN]) # Always return the main draft.
82
+ for example in self.data.values():
83
+ if example.draft:
84
+ drafts.add(example.draft)
85
+ return list(sorted(drafts))
86
+
87
+
88
+ class OverallScore(str, Enum):
89
+ """Enum holding the overall score."""
90
+ NOT_GOOD = 'not_good'
91
+ OK = 'ok'
92
+ GOOD = 'good'
93
+ VERY_GOOD = 'very_good'
94
+ GREAT = 'great'
95
+
96
+
97
+ def _get_overall_score(f1_score: float) -> OverallScore:
98
+ if f1_score < 0.5:
99
+ return OverallScore.NOT_GOOD
100
+ if f1_score < 0.8:
101
+ return OverallScore.OK
102
+ if f1_score < 0.9:
103
+ return OverallScore.GOOD
104
+ if f1_score < 0.95:
105
+ return OverallScore.VERY_GOOD
106
+ return OverallScore.GREAT
107
+
108
+
109
+ class ConceptMetrics(BaseModel):
110
+ """Metrics for a concept."""
111
+ # The average F1 score for the concept computed using cross validation.
112
+ f1: float
113
+ precision: float
114
+ recall: float
115
+ roc_auc: float
116
+ overall: OverallScore
117
+
118
+
119
+ @dataclasses.dataclass
120
+ class LogisticEmbeddingModel:
121
+ """A model that uses logistic regression with embeddings."""
122
+
123
+ _metrics: Optional[ConceptMetrics] = None
124
+ _threshold: float = 0.5
125
+
126
+ def __post_init__(self) -> None:
127
+ # See `notebooks/Toxicity.ipynb` for an example of training a concept model.
128
+ self._model = LogisticRegression(
129
+ class_weight='balanced', C=30, tol=1e-5, warm_start=True, max_iter=5_000, n_jobs=-1)
130
+
131
+ def score_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
132
+ """Get the scores for the provided embeddings."""
133
+ y_probs = self._model.predict_proba(embeddings)[:, 1]
134
+ # Map [0, threshold, 1] to [0, 0.5, 1].
135
+ interpolate_fn = interp1d([0, self._threshold, 1], [0, 0.4999, 1])
136
+ return interpolate_fn(y_probs)
137
+
138
+ def _setup_training(self, X_train: np.ndarray,
139
+ labels: Union[list[bool], np.ndarray]) -> tuple[np.ndarray, np.ndarray]:
140
+ y_train = np.array(labels)
141
+ # Shuffle the data in unison.
142
+ p = np.random.permutation(len(X_train))
143
+ X_train = X_train[p]
144
+ y_train = y_train[p]
145
+ return X_train, y_train
146
+
147
+ def fit(self, embeddings: np.ndarray, labels: list[bool]) -> None:
148
+ """Fit the model to the provided embeddings and labels."""
149
+ label_set = set(labels)
150
+ if len(label_set) < 2:
151
+ dim = embeddings.shape[1]
152
+ random_vector = np.random.randn(dim).astype(np.float32)
153
+ random_vector /= np.linalg.norm(random_vector)
154
+ embeddings = np.vstack([embeddings, random_vector])
155
+ labels.append(False if True in label_set else True)
156
+
157
+ if len(labels) != len(embeddings):
158
+ raise ValueError(
159
+ f'Length of embeddings ({len(embeddings)}) must match length of labels ({len(labels)})')
160
+ X_train, y_train = self._setup_training(embeddings, labels)
161
+ self._model.fit(X_train, y_train)
162
+ self._metrics, self._threshold = self._compute_metrics(embeddings, labels)
163
+
164
+ def _compute_metrics(self, embeddings: np.ndarray,
165
+ labels: list[bool]) -> tuple[Optional[ConceptMetrics], float]:
166
+ """Return the concept metrics."""
167
+ labels_np = np.array(labels)
168
+ n_splits = min(len(labels_np), MAX_NUM_CROSS_VAL_MODELS)
169
+ fold = KFold(n_splits, shuffle=True, random_state=42)
170
+
171
+ def _fit_and_score(model: LogisticRegression, X_train: np.ndarray, y_train: np.ndarray,
172
+ X_test: np.ndarray, y_test: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
173
+ if len(set(y_train)) < 2:
174
+ return np.array([]), np.array([])
175
+ model.fit(X_train, y_train)
176
+ y_pred = model.predict_proba(X_test)[:, 1]
177
+ return y_test, y_pred
178
+
179
+ # Compute the metrics for each validation fold in parallel.
180
+ jobs: list[Callable] = []
181
+ for (train_index, test_index) in fold.split(embeddings):
182
+ X_train, y_train = embeddings[train_index], labels_np[train_index]
183
+ X_train, y_train = self._setup_training(X_train, y_train)
184
+ X_test, y_test = embeddings[test_index], labels_np[test_index]
185
+ model = clone(self._model)
186
+ jobs.append(delayed(_fit_and_score)(model, X_train, y_train, X_test, y_test))
187
+ results = Parallel(n_jobs=-1)(jobs)
188
+
189
+ y_test = np.concatenate([y_test for y_test, _ in results], axis=0)
190
+ y_pred = np.concatenate([y_pred for _, y_pred in results], axis=0)
191
+ if len(set(y_test)) < 2:
192
+ return None, 0.5
193
+ roc_auc_val = roc_auc_score(y_test, y_pred)
194
+ precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
195
+ numerator = (1 + F_BETA_WEIGHT**2) * precision * recall
196
+ denom = (F_BETA_WEIGHT**2 * precision) + recall
197
+ f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
198
+ max_f1: float = np.max(f1_scores)
199
+ max_f1_index = np.argmax(f1_scores)
200
+ max_f1_thresh: float = thresholds[max_f1_index]
201
+ max_f1_prec: float = precision[max_f1_index]
202
+ max_f1_recall: float = recall[max_f1_index]
203
+ metrics = ConceptMetrics(
204
+ f1=max_f1,
205
+ precision=max_f1_prec,
206
+ recall=max_f1_recall,
207
+ roc_auc=float(roc_auc_val),
208
+ overall=_get_overall_score(max_f1))
209
+ return metrics, max_f1_thresh
210
+
211
+
212
+ def draft_examples(concept: Concept, draft: DraftId) -> dict[str, Example]:
213
+ """Get the examples in the provided draft by overriding the main draft."""
214
+ draft_examples: dict[str, dict[str, Example]] = {}
215
+ for id, example in concept.data.items():
216
+ draft_examples.setdefault(example.draft or DRAFT_MAIN, {})[example.id] = example
217
+
218
+ if draft == DRAFT_MAIN:
219
+ return draft_examples.get(DRAFT_MAIN, {})
220
+
221
+ if draft not in draft_examples:
222
+ raise ValueError(
223
+ f'Draft {draft} not found in concept. Found drafts: {list(draft_examples.keys())}')
224
+
225
+ # Map the text of the draft to its id so we can dedupe with main.
226
+ draft_text_ids = {example.text: id for id, example in draft_examples[draft].items()}
227
+
228
+ # Write each of examples from main to the draft examples only if the text does not appear in the
229
+ # draft.
230
+ for id, example in draft_examples[DRAFT_MAIN].items():
231
+ if example.text not in draft_text_ids:
232
+ draft_examples[draft][id] = example
233
+
234
+ return draft_examples[draft]
235
+
236
+
237
+ @dataclasses.dataclass
238
+ class ConceptModel:
239
+ """A concept model. Stores all concept model drafts and manages syncing."""
240
+ # The concept that this model is for.
241
+ namespace: str
242
+ concept_name: str
243
+
244
+ # The name of the embedding for this model.
245
+ embedding_name: str
246
+ version: int = 0
247
+
248
+ batch_size = 4096
249
+
250
+ # The following fields are excluded from JSON serialization, but still pickle-able.
251
+ # Maps a concept id to the embeddings.
252
+ _embeddings: dict[str, np.ndarray] = dataclasses.field(default_factory=dict)
253
+ _logistic_models: dict[DraftId, LogisticEmbeddingModel] = dataclasses.field(default_factory=dict)
254
+
255
+ def get_metrics(self, concept: Concept) -> Optional[ConceptMetrics]:
256
+ """Return the metrics for this model."""
257
+ return self._get_logistic_model(DRAFT_MAIN)._metrics
258
+
259
+ def score_embeddings(self, draft: DraftId, embeddings: np.ndarray) -> np.ndarray:
260
+ """Get the scores for the provided embeddings."""
261
+ return self._get_logistic_model(draft).score_embeddings(embeddings)
262
+
263
+ def coef(self, draft: DraftId) -> np.ndarray:
264
+ """Get the coefficients of the underlying ML model."""
265
+ return self._get_logistic_model(draft)._model.coef_.reshape(-1)
266
+
267
+ def _get_logistic_model(self, draft: DraftId) -> LogisticEmbeddingModel:
268
+ """Get the logistic model for the provided draft."""
269
+ if draft not in self._logistic_models:
270
+ self._logistic_models[draft] = LogisticEmbeddingModel()
271
+ return self._logistic_models[draft]
272
+
273
+ def sync(self, concept: Concept) -> bool:
274
+ """Update the model with the latest labeled concept data."""
275
+ if concept.version == self.version:
276
+ # The model is up to date.
277
+ return False
278
+
279
+ concept_path = (f'{self.namespace}/{self.concept_name}/'
280
+ f'{self.embedding_name}')
281
+ with DebugTimer(f'Computing embeddings for "{concept_path}"'):
282
+ self._compute_embeddings(concept)
283
+
284
+ # Fit each of the drafts, sort by draft name for deterministic behavior.
285
+ for draft in concept.drafts():
286
+ examples = draft_examples(concept, draft)
287
+ embeddings = np.array([self._embeddings[id] for id in examples.keys()])
288
+ labels = [example.label for example in examples.values()]
289
+ model = self._get_logistic_model(draft)
290
+ with DebugTimer(f'Fitting model for "{concept_path}"'):
291
+ model.fit(embeddings, labels)
292
+
293
+ # Synchronize the model version with the concept version.
294
+ self.version = concept.version
295
+
296
+ return True
297
+
298
+ def _compute_embeddings(self, concept: Concept) -> None:
299
+ signal_cls = get_signal_cls(self.embedding_name)
300
+ if not signal_cls:
301
+ raise ValueError(f'Embedding signal "{self.embedding_name}" not found in the registry.')
302
+ embedding_signal = signal_cls()
303
+ if not isinstance(embedding_signal, TextEmbeddingSignal):
304
+ raise ValueError(f'Only text embedding signals are currently supported for concepts. '
305
+ f'"{self.embedding_name}" is a {type(embedding_signal)}.')
306
+
307
+ embed_fn = get_embed_fn(self.embedding_name, split=False)
308
+ concept_embeddings: dict[str, np.ndarray] = {}
309
+
310
+ examples = concept.data.items()
311
+ if not examples:
312
+ raise ValueError(f'Cannot sync concept "{concept.concept_name}". It has no examples.')
313
+
314
+ # Compute the embeddings for the examples with cache miss.
315
+ texts_of_missing_embeddings: dict[str, str] = {}
316
+ for id, example in examples:
317
+ if id in self._embeddings:
318
+ # Cache hit.
319
+ concept_embeddings[id] = self._embeddings[id]
320
+ else:
321
+ # Cache miss.
322
+ # TODO(smilkov): Support images.
323
+ texts_of_missing_embeddings[id] = example.text or ''
324
+
325
+ missing_ids = texts_of_missing_embeddings.keys()
326
+ missing_embeddings = embed_fn(list(texts_of_missing_embeddings.values()))
327
+
328
+ for id, (embedding,) in zip(missing_ids, missing_embeddings):
329
+ concept_embeddings[id] = embedding['vector']
330
+ self._embeddings = concept_embeddings
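LogisticEmbeddingModel._compute_metrics above cross-validates a logistic regression over the example embeddings and picks the decision threshold that maximizes the F-beta score with beta = 0.5, i.e. F_beta = (1 + beta^2) * P * R / (beta^2 * P + R), weighting precision twice as heavily as recall. A standalone toy version of that threshold selection on synthetic data (no cross-validation folds; a sketch of the idea, not the library's code):

# Toy version of the F-beta threshold selection used in _compute_metrics.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 8))
y = (X[:, 0] + 0.25 * rng.normal(size=200)) > 0  # Synthetic labels.

model = LogisticRegression(class_weight='balanced', max_iter=5_000)
model.fit(X, y)
y_prob = model.predict_proba(X)[:, 1]

beta = 0.5  # Precision weighted 2x over recall, as in F_BETA_WEIGHT.
precision, recall, thresholds = precision_recall_curve(y, y_prob)
numerator = (1 + beta**2) * precision * recall
denom = beta**2 * precision + recall
fbeta = np.divide(numerator, denom, out=np.zeros_like(denom), where=denom != 0)
fbeta = fbeta[:-1]  # The final curve point has no associated threshold.
best = int(np.argmax(fbeta))
print(f'best F{beta} = {fbeta[best]:.3f} at threshold {thresholds[best]:.3f}')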
lilac/concepts/db_concept.py ADDED
@@ -0,0 +1,520 @@
1
+ """The concept database."""
2
+
3
+ import abc
4
+ import glob
5
+ import json
6
+ import os
7
+ import pathlib
8
+ import pickle
9
+ import shutil
10
+
11
+ # NOTE: We have to import the module for uuid so it can be mocked.
12
+ import uuid
13
+ from typing import Any, List, Optional, Union, cast
14
+
15
+ from pydantic import BaseModel
16
+ from typing_extensions import override
17
+
18
+ from ..auth import ConceptAuthorizationException, UserInfo
19
+ from ..env import data_path, env
20
+ from ..schema import SignalInputType
21
+ from ..signals.signal import get_signal_cls
22
+ from ..utils import delete_file, file_exists, open_file
23
+ from .concept import DRAFT_MAIN, Concept, ConceptModel, DraftId, Example, ExampleIn
24
+
25
+ CONCEPTS_DIR = 'concept'
26
+ CONCEPT_JSON_FILENAME = 'concept.json'
27
+
28
+
29
+ class ConceptNamespaceACL(BaseModel):
30
+ """The access control list for a namespace."""
31
+ # Whether the current user can read concepts in the namespace.
32
+ read: bool
33
+ # Whether the current user can add concepts to the namespace.
34
+ write: bool
35
+
36
+
37
+ class ConceptACL(BaseModel):
38
+ """The access control list for an individual concept."""
39
+ # Whether the current user can read the concept.
40
+ read: bool
41
+ # Whether the current user can edit the concept, including adding examples or deleting the
42
+ # concept.
43
+ write: bool
44
+
45
+
46
+ class ConceptInfo(BaseModel):
47
+ """Information about a concept."""
48
+ namespace: str
49
+ name: str
50
+ description: Optional[str] = None
51
+ type: SignalInputType
52
+ drafts: list[DraftId]
53
+
54
+ acls: ConceptACL
55
+
56
+
57
+ class ConceptUpdate(BaseModel):
58
+ """An update to a concept."""
59
+ # List of examples to be inserted.
60
+ insert: Optional[list[ExampleIn]] = []
61
+
62
+ # List of examples to be updated.
63
+ update: Optional[list[Example]] = []
64
+
65
+ # The ids of the examples to be removed.
66
+ remove: Optional[list[str]] = []
67
+
68
+
69
+ class ConceptDB(abc.ABC):
70
+ """Interface for the concept database."""
71
+
72
+ @abc.abstractmethod
73
+ def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
74
+ """List all the concepts."""
75
+ pass
76
+
77
+ @abc.abstractmethod
78
+ def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
79
+ """Return the ACL for a namespace."""
80
+ pass
81
+
82
+ @abc.abstractmethod
83
+ def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
84
+ """Return the ACL for a concept."""
85
+ pass
86
+
87
+ @abc.abstractmethod
88
+ def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
89
+ """Return a concept or None if there isn't one."""
90
+ pass
91
+
92
+ @abc.abstractmethod
93
+ def create(self,
94
+ namespace: str,
95
+ name: str,
96
+ type: SignalInputType,
97
+ description: Optional[str] = None,
98
+ user: Optional[UserInfo] = None) -> Concept:
99
+ """Create a concept.
100
+
101
+ Args:
102
+ namespace: The namespace of the concept.
103
+ name: The name of the concept.
104
+ type: The input type of the concept.
105
+ description: The description of the concept.
106
+ user: The user creating the concept, if authentication is enabled.
107
+ """
108
+ pass
109
+
110
+ @abc.abstractmethod
111
+ def edit(self,
112
+ namespace: str,
113
+ name: str,
114
+ change: ConceptUpdate,
115
+ user: Optional[UserInfo] = None) -> Concept:
116
+ """Edit a concept. If the concept doesn't exist, throw an error."""
117
+ pass
118
+
119
+ @abc.abstractmethod
120
+ def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
121
+ """Remove a concept."""
122
+ pass
123
+
124
+ @abc.abstractmethod
125
+ def merge_draft(self,
126
+ namespace: str,
127
+ name: str,
128
+ draft: DraftId,
129
+ user: Optional[UserInfo] = None) -> Concept:
130
+ """Merge a draft concept.."""
131
+ pass
132
+
133
+
134
+ class ConceptModelDB(abc.ABC):
135
+ """Interface for the concept model database."""
136
+
137
+ _concept_db: ConceptDB
138
+
139
+ def __init__(self, concept_db: ConceptDB) -> None:
140
+ self._concept_db = concept_db
141
+
142
+ @abc.abstractmethod
143
+ def create(self,
144
+ namespace: str,
145
+ concept_name: str,
146
+ embedding_name: str,
147
+ user: Optional[UserInfo] = None) -> ConceptModel:
148
+ """Create the concept model."""
149
+ pass
150
+
151
+ @abc.abstractmethod
152
+ def get(self,
153
+ namespace: str,
154
+ concept_name: str,
155
+ embedding_name: str,
156
+ user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
157
+ """Get the model associated with the provided concept the embedding.
158
+
159
+ Returns None if the model does not exist.
160
+ """
161
+ pass
162
+
163
+ @abc.abstractmethod
164
+ def _save(self, model: ConceptModel) -> None:
165
+ """Save the concept model."""
166
+ pass
167
+
168
+ def in_sync(self, model: ConceptModel, user: Optional[UserInfo] = None) -> bool:
169
+ """Return True if the model is up to date with the concept."""
170
+ concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
171
+ if not concept:
172
+ raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
173
+ return concept.version == model.version
174
+
175
+ def sync(self, model: ConceptModel, user: Optional[UserInfo] = None) -> bool:
176
+ """Sync the concept model. Returns true if the model was updated."""
177
+ concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
178
+ if not concept:
179
+ raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
180
+ model_updated = model.sync(concept)
181
+ if model_updated:
182
+ self._save(model)
183
+ return model_updated
184
+
185
+ @abc.abstractmethod
186
+ def remove(self, namespace: str, concept_name: str, embedding_name: str) -> None:
187
+ """Remove the model of a concept."""
188
+ pass
189
+
190
+ @abc.abstractmethod
191
+ def get_models(self, namespace: str, concept_name: str) -> list[ConceptModel]:
192
+ """List all the models associated with a concept."""
193
+ pass
194
+
195
+
196
+ class DiskConceptModelDB(ConceptModelDB):
197
+ """Interface for the concept model database."""
198
+
199
+ def __init__(self,
200
+ concept_db: ConceptDB,
201
+ base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
202
+ super().__init__(concept_db)
203
+ self._base_dir = base_dir
204
+
205
+ def _get_base_dir(self) -> str:
206
+ return str(self._base_dir) if self._base_dir else data_path()
207
+
208
+ @override
209
+ def create(self,
210
+ namespace: str,
211
+ concept_name: str,
212
+ embedding_name: str,
213
+ user: Optional[UserInfo] = None) -> ConceptModel:
214
+ if self.get(namespace, concept_name, embedding_name, user=user):
215
+ raise ValueError('Concept model already exists.')
216
+ concept = self._concept_db.get(namespace, concept_name, user=user)
217
+ if not concept:
218
+ raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
219
+
220
+ return ConceptModel(
221
+ namespace=namespace, concept_name=concept_name, embedding_name=embedding_name)
222
+
223
+ @override
224
+ def get(self,
225
+ namespace: str,
226
+ concept_name: str,
227
+ embedding_name: str,
228
+ user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
229
+ # Make sure the concept exists.
230
+ concept = self._concept_db.get(namespace, concept_name, user=user)
231
+ if not concept:
232
+ raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
233
+
234
+ # Make sure that the embedding signal exists.
235
+ if not get_signal_cls(embedding_name):
236
+ raise ValueError(f'Embedding signal "{embedding_name}" not found in the registry.')
237
+
238
+ concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
239
+ embedding_name)
240
+ if not file_exists(concept_model_path):
241
+ return None
242
+
243
+ with open_file(concept_model_path, 'rb') as f:
244
+ return pickle.load(f)
245
+
246
+ def _save(self, model: ConceptModel) -> None:
247
+ """Save the concept model."""
248
+ concept_model_path = _concept_model_path(self._get_base_dir(), model.namespace,
249
+ model.concept_name, model.embedding_name)
250
+ with open_file(concept_model_path, 'wb') as f:
251
+ pickle.dump(model, f)
252
+
253
+ @override
254
+ def remove(self,
255
+ namespace: str,
256
+ concept_name: str,
257
+ embedding_name: str,
258
+ user: Optional[UserInfo] = None) -> None:
259
+ concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
260
+ embedding_name)
261
+
262
+ if not file_exists(concept_model_path):
263
+ raise ValueError(f'Concept model {namespace}/{concept_name}/{embedding_name} does not exist.')
264
+
265
+ delete_file(concept_model_path)
266
+
267
+ @override
268
+ def get_models(self,
269
+ namespace: str,
270
+ concept_name: str,
271
+ user: Optional[UserInfo] = None) -> list[ConceptModel]:
272
+ """List all the models associated with a concept."""
273
+ model_files = glob.iglob(
274
+ os.path.join(get_concept_output_dir(self._get_base_dir(), namespace, concept_name), '*.pkl'))
275
+ models: list[ConceptModel] = []
276
+ for model_file in model_files:
277
+ embedding_name = os.path.basename(model_file)[:-len('.pkl')]
278
+ model = self.get(namespace, concept_name, embedding_name, user=user)
279
+ if model:
280
+ models.append(model)
281
+ return models
282
+
283
+
284
+ def get_concept_output_dir(base_dir: str, namespace: str, name: str) -> str:
285
+ """Return the output directory for a given concept."""
286
+ return os.path.join(base_dir, CONCEPTS_DIR, namespace, name)
287
+
288
+
289
+ def _concept_json_path(base_dir: str, namespace: str, name: str) -> str:
290
+ return os.path.join(get_concept_output_dir(base_dir, namespace, name), CONCEPT_JSON_FILENAME)
291
+
292
+
293
+ def _concept_model_path(base_dir: str, namespace: str, concept_name: str,
294
+ embedding_name: str) -> str:
295
+
296
+ return os.path.join(
297
+ get_concept_output_dir(base_dir, namespace, concept_name), f'{embedding_name}.pkl')
298
+
299
+
300
+ class DiskConceptDB(ConceptDB):
301
+ """A concept database."""
302
+
303
+ def __init__(self, base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
304
+ self._base_dir = base_dir
305
+
306
+ def _get_base_dir(self) -> str:
307
+ return str(self._base_dir) if self._base_dir else data_path()
308
+
309
+ @override
310
+ def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
311
+ if not env('LILAC_AUTH_ENABLED'):
312
+ return ConceptNamespaceACL(read=True, write=True)
313
+
314
+ if namespace == 'lilac':
315
+ return ConceptNamespaceACL(read=True, write=False)
316
+ if user and user.id == namespace:
317
+ return ConceptNamespaceACL(read=True, write=True)
318
+
319
+ return ConceptNamespaceACL(read=False, write=False)
320
+
321
+ @override
322
+ def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
323
+ namespace_acls = self.namespace_acls(namespace, user=user)
324
+ # Concept ACL inherit from the namespace ACL. We currently don't have concept-specific
325
+ # ACL.
326
+ return ConceptACL(read=namespace_acls.read, write=namespace_acls.write)
327
+
328
+ @override
329
+ def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
330
+ namespaces: Optional[list[str]] = None
331
+ if env('LILAC_AUTH_ENABLED'):
332
+ namespaces = ['lilac']
333
+ if user:
334
+ namespaces += [user.id]
335
+
336
+ # Read the concepts and return a ConceptInfo containing the namespace and name.
337
+ concept_infos = []
338
+ for root, _, files in os.walk(self._get_base_dir()):
339
+ for file in files:
340
+ if file == CONCEPT_JSON_FILENAME:
341
+ namespace, name = root.split('/')[-2:]
342
+ if namespaces and namespace not in namespaces:
343
+ # Ignore concepts that are not in the namespace, if provided.
344
+ continue
345
+
346
+ concept = cast(Concept, self.get(namespace, name, user=user))
347
+ concept_infos.append(
348
+ ConceptInfo(
349
+ namespace=namespace,
350
+ name=name,
351
+ description=concept.description,
352
+ type=SignalInputType.TEXT,
353
+ drafts=concept.drafts(),
354
+ acls=self.concept_acls(namespace, name, user=user)))
355
+
356
+ return concept_infos
357
+
358
+ @override
359
+ def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
360
+ # If the user does not have access to the concept, return None.
361
+ acls = self.concept_acls(namespace, name, user=user)
362
+ if not acls.read:
363
+ raise ConceptAuthorizationException(
364
+ f'Concept "{namespace}/{name}" does not exist or user does not have access.')
365
+
366
+ concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
367
+ if not file_exists(concept_json_path):
368
+ return None
369
+
370
+ with open_file(concept_json_path) as f:
371
+ obj: dict[str, Any] = json.load(f)
372
+ if 'namespace' not in obj:
373
+ obj['namespace'] = namespace
374
+ return Concept.parse_obj(obj)
375
+
376
+ @override
377
+ def create(self,
378
+ namespace: str,
379
+ name: str,
380
+ type: SignalInputType,
381
+ description: Optional[str] = None,
382
+ user: Optional[UserInfo] = None) -> Concept:
383
+ """Create a concept."""
384
+ # If the user does not have access to the write to the concept namespace, throw.
385
+ acls = self.namespace_acls(namespace, user=user)
386
+ if not acls.write:
387
+ raise ConceptAuthorizationException(
388
+ f'Concept namespace "{namespace}" does not exist or user does not have access.')
389
+
390
+ concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
391
+ if file_exists(concept_json_path):
392
+ raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" already exists.')
393
+
394
+ concept = Concept(
395
+ namespace=namespace, concept_name=name, type=type, data={}, description=description)
396
+ self._save(concept)
397
+ return concept
398
+
399
+ def _validate_examples(self, examples: List[Union[ExampleIn, Example]],
400
+ type: SignalInputType) -> None:
401
+ for example in examples:
402
+ inferred_type = 'text' if example.text else 'img'
403
+ if inferred_type != type:
404
+ raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
405
+
406
+ @override
407
+ def edit(self,
408
+ namespace: str,
409
+ name: str,
410
+ change: ConceptUpdate,
411
+ user: Optional[UserInfo] = None) -> Concept:
412
+ # If the user does not have write access to the concept, raise an authorization error.
413
+ acls = self.concept_acls(namespace, name, user=user)
414
+ if not acls.write:
415
+ raise ConceptAuthorizationException(
416
+ f'Concept "{namespace}/{name}" does not exist or user does not have access.')
417
+
418
+ concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
419
+
420
+ if not file_exists(concept_json_path):
421
+ raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist. '
422
+ 'Please call create() first.')
423
+
424
+ inserted_points = change.insert or []
425
+ updated_points = change.update or []
426
+ removed_points = change.remove or []
427
+
428
+ concept = cast(Concept, self.get(namespace, name, user=user))
429
+
430
+ self._validate_examples([*inserted_points, *updated_points], concept.type)
431
+
432
+ for remove_example in removed_points:
433
+ if remove_example not in concept.data:
434
+ raise ValueError(f'Example with id "{remove_example}" does not exist.')
435
+ concept.data.pop(remove_example)
436
+
437
+ for example in inserted_points:
438
+ id = uuid.uuid4().hex
439
+ concept.data[id] = Example(id=id, **example.dict())
440
+
441
+ for example in updated_points:
442
+ if example.id not in concept.data:
443
+ raise ValueError(f'Example with id "{example.id}" does not exist.')
444
+
445
+ # Replace the old example with the updated copy, keeping the same id so references stay valid.
446
+ concept.data.pop(example.id)
447
+ concept.data[example.id] = example.copy()
448
+
449
+ concept.version += 1
450
+
451
+ self._save(concept)
452
+
453
+ return concept
454
+
455
+ def _save(self, concept: Concept) -> None:
456
+ concept_json_path = _concept_json_path(self._get_base_dir(), concept.namespace,
457
+ concept.concept_name)
458
+ with open_file(concept_json_path, 'w') as f:
459
+ f.write(concept.json(exclude_none=True, indent=2, exclude_defaults=True))
460
+
461
+ @override
462
+ def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
463
+ # If the user does not have write access to the concept, raise an authorization error.
464
+ acls = self.concept_acls(namespace, name, user=user)
465
+ if not acls.write:
466
+ raise ConceptAuthorizationException(
467
+ f'Concept "{namespace}/{name}" does not exist or user does not have access.')
468
+
469
+ concept_dir = get_concept_output_dir(self._get_base_dir(), namespace, name)
470
+
471
+ if not file_exists(concept_dir):
472
+ raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
473
+
474
+ shutil.rmtree(concept_dir, ignore_errors=True)
475
+
476
+ @override
477
+ def merge_draft(self,
478
+ namespace: str,
479
+ name: str,
480
+ draft: DraftId,
481
+ user: Optional[UserInfo] = None) -> Concept:
482
+ """Merge a draft concept."""
483
+ # If the user does not have write access to the concept, raise an authorization error.
484
+ acls = self.concept_acls(namespace, name, user=user)
485
+ if not acls.write:
486
+ raise ConceptAuthorizationException(
487
+ f'Concept "{namespace}/{name}" does not exist or user does not have access.')
488
+
489
+ concept = self.get(namespace, name, user=user)
490
+ if not concept:
491
+ raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
492
+
493
+ if draft == DRAFT_MAIN:
494
+ return concept
495
+
496
+ # Map the text of examples in main so we can remove them if they are duplicates.
497
+ main_text_ids: dict[Optional[str], str] = {
498
+ example.text: id for id, example in concept.data.items() if example.draft == DRAFT_MAIN
499
+ }
500
+
501
+ draft_examples: dict[str, Example] = {
502
+ id: example for id, example in concept.data.items() if example.draft == draft
503
+ }
504
+ for example in draft_examples.values():
505
+ example.draft = DRAFT_MAIN
506
+ # Remove duplicates in main.
507
+ main_text_id = main_text_ids.get(example.text)
508
+ if main_text_id:
509
+ del concept.data[main_text_id]
510
+
511
+ concept.version += 1
512
+
513
+ self._save(concept)
514
+
515
+ return concept
516
+
517
+
518
+ # A singleton concept database.
519
+ DISK_CONCEPT_DB = DiskConceptDB()
520
+ DISK_CONCEPT_MODEL_DB = DiskConceptModelDB(DISK_CONCEPT_DB)
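An illustrative usage sketch of the disk-backed concept DB above (not part of the diff): the import locations, the `ExampleIn` field names and the home of `SignalInputType` are assumptions inferred from the file layout and the validation code in this file.

# Sketch only: import paths and ExampleIn fields are assumptions.
from lilac.concepts.concept import ExampleIn
from lilac.concepts.db_concept import DISK_CONCEPT_DB, ConceptUpdate
from lilac.signals.signal import SignalInputType  # Assumed location of SignalInputType.

# With LILAC_AUTH_ENABLED unset, every namespace is readable and writable.
concept = DISK_CONCEPT_DB.create(
  'local', 'toxicity', type=SignalInputType.TEXT, description='Toxic language examples.')

# Insert two labeled examples; edit() bumps the concept version and saves it to disk.
DISK_CONCEPT_DB.edit(
  'local', 'toxicity',
  ConceptUpdate(insert=[
    ExampleIn(label=True, text='You are the worst.'),
    ExampleIn(label=False, text='Thanks, that was helpful.'),
  ]))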
lilac/config.py ADDED
@@ -0,0 +1,80 @@
1
+ """Configurations for a dataset run."""
2
+ from typing import Optional
3
+
4
+ from pydantic import BaseModel, validator
5
+
6
+ from .data.dataset import DatasetSettings
7
+ from .schema import Path, PathTuple, normalize_path
8
+ from .signals.signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
9
+ from .sources.source import Source
10
+ from .sources.source_registry import resolve_source
11
+
12
+
13
+ class SignalConfig(BaseModel):
14
+ """Configures a signal on a source path."""
15
+ path: PathTuple
16
+ signal: Signal
17
+
18
+ @validator('path', pre=True)
19
+ def parse_path(cls, path: Path) -> PathTuple:
20
+ """Parse a path."""
21
+ return normalize_path(path)
22
+
23
+ @validator('signal', pre=True)
24
+ def parse_signal(cls, signal: dict) -> Signal:
25
+ """Parse a signal to its specific subclass instance."""
26
+ return resolve_signal(signal)
27
+
28
+
29
+ class EmbeddingConfig(BaseModel):
30
+ """Configures an embedding on a source path."""
31
+ path: PathTuple
32
+ embedding: str
33
+
34
+ @validator('path', pre=True)
35
+ def parse_path(cls, path: Path) -> PathTuple:
36
+ """Parse a path."""
37
+ return normalize_path(path)
38
+
39
+ @validator('embedding', pre=True)
40
+ def validate_embedding(cls, embedding: str) -> str:
41
+ """Validate the embedding is registered."""
42
+ get_signal_by_type(embedding, TextEmbeddingSignal)
43
+ return embedding
44
+
45
+
46
+ class DatasetConfig(BaseModel):
47
+ """Configures a dataset with a source and transformations."""
48
+ # The namespace and name of the dataset.
49
+ namespace: str
50
+ name: str
51
+
52
+ # The source configuration.
53
+ source: Source
54
+
55
+ # Model configuration: embeddings and signals on paths.
56
+ embeddings: Optional[list[EmbeddingConfig]]
57
+ # When defined, uses this list of signals instead of running all signals.
58
+ signals: Optional[list[SignalConfig]]
59
+
60
+ # Dataset settings, default embeddings and UI settings like media paths.
61
+ settings: Optional[DatasetSettings]
62
+
63
+ @validator('source', pre=True)
64
+ def parse_source(cls, source: dict) -> Source:
65
+ """Parse a source to its specific subclass instance."""
66
+ return resolve_source(source)
67
+
68
+
69
+ class Config(BaseModel):
70
+ """Configures a set of datasets for a lilac instance."""
71
+ datasets: list[DatasetConfig]
72
+
73
+ # When defined, uses this list of signals to run over every dataset, over all media paths, unless
74
+ # signals is overridden by a specific dataset.
75
+ signals: list[Signal] = []
76
+
77
+ @validator('signals', pre=True)
78
+ def parse_signal(cls, signals: list[dict]) -> list[Signal]:
79
+ """Parse alist of signals to their specific subclass instances."""
80
+ return [resolve_signal(signal) for signal in signals]
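To make the config shapes above concrete, a hypothetical payload is sketched below (not part of the diff). The discriminator keys (`source_name`, `signal_name`), the `json` source fields and the `sbert` / `lang_detection` names are assumptions that depend on what is registered at runtime.

# Hypothetical config payload; the dict fields inside 'source' and 'signal' are assumptions.
from lilac.config import Config

config = Config.parse_obj({
  'datasets': [{
    'namespace': 'local',
    'name': 'movies',
    # resolve_source() picks the Source subclass based on this dict.
    'source': {'source_name': 'json', 'filepaths': ['movies.jsonl']},
    # validate_embedding() requires 'sbert' to be a registered TextEmbeddingSignal.
    'embeddings': [{'path': 'overview', 'embedding': 'sbert'}],
    # resolve_signal() picks the Signal subclass based on this dict.
    'signals': [{'path': 'overview', 'signal': {'signal_name': 'lang_detection'}}],
  }]
})
print(config.datasets[0].source)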
lilac/conftest.py ADDED
@@ -0,0 +1,28 @@
1
+ """Fixtures for dataset tests."""
2
+ import os
3
+ import pathlib
4
+ from typing import Generator, Optional, Type
5
+
6
+ import pytest
7
+ from pytest_mock import MockerFixture
8
+
9
+ from .data.dataset import Dataset
10
+ from .data.dataset_duckdb import DatasetDuckDB
11
+ from .data.dataset_test_utils import make_dataset
12
+ from .db_manager import set_default_dataset_cls
13
+ from .schema import Item, Schema
14
+
15
+
16
+ @pytest.fixture(scope='function', params=[DatasetDuckDB])
17
+ def make_test_data(tmp_path: pathlib.Path, mocker: MockerFixture,
18
+ request: pytest.FixtureRequest) -> Generator:
19
+ """A pytest fixture for creating temporary test datasets."""
20
+ mocker.patch.dict(os.environ, {'LILAC_DATA_PATH': str(tmp_path)})
21
+ dataset_cls: Type[Dataset] = request.param
22
+ set_default_dataset_cls(dataset_cls)
23
+
24
+ def _make_test_data(items: list[Item], schema: Optional[Schema] = None) -> Dataset:
25
+ return make_dataset(dataset_cls, tmp_path, items, schema)
26
+
27
+ # Return the factory for datasets that test methods can use.
28
+ yield _make_test_data
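A sketch of a test that consumes the fixture above (illustrative; it assumes `make_dataset` assigns row UUIDs and infers a schema when none is provided):

# Illustrative test using the `make_test_data` fixture.
from lilac.data.dataset import Dataset


def test_select_rows_round_trip(make_test_data) -> None:
  dataset: Dataset = make_test_data([{'text': 'hello'}, {'text': 'world'}])
  rows = list(dataset.select_rows(['text']))
  assert len(rows) == 2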
lilac/data/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .dataset import Column, ConceptQuery, KeywordQuery, Search, SemanticQuery
2
+
3
+ __all__ = [
4
+ 'Column',
5
+ 'Search',
6
+ 'KeywordQuery',
7
+ 'ConceptQuery',
8
+ 'SemanticQuery',
9
+ ]
lilac/data/dataset.py ADDED
@@ -0,0 +1,485 @@
1
+ """The interface for the database."""
2
+ import abc
3
+ import enum
4
+ import pathlib
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from datetime import datetime
7
+ from typing import Any, Iterator, Literal, Optional, Sequence, Union
8
+
9
+ import pandas as pd
10
+ from pydantic import BaseModel
11
+ from pydantic import Field as PydanticField
12
+ from pydantic import StrictBool, StrictBytes, StrictFloat, StrictInt, StrictStr, validator
13
+
14
+ from ..auth import UserInfo
15
+ from ..schema import VALUE_KEY, Bin, DataType, Path, PathTuple, Schema, normalize_path
16
+ from ..signals.signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
17
+ from ..tasks import TaskStepId
18
+
19
+ # Threshold for rejecting certain queries (e.g. group by) for columns with large cardinality.
20
+ TOO_MANY_DISTINCT = 1_000_000
21
+
22
+
23
+ class SelectRowsResult:
24
+ """The result of a select rows query."""
25
+
26
+ def __init__(self, df: pd.DataFrame, total_num_rows: int) -> None:
27
+ """Initialize the result."""
28
+ self._df = df
29
+ self.total_num_rows = total_num_rows
30
+
31
+ def __iter__(self) -> Iterator:
32
+ return (row.to_dict() for _, row in self._df.iterrows())
33
+
34
+ def df(self) -> pd.DataFrame:
35
+ """Convert the result to a pandas DataFrame."""
36
+ return self._df
37
+
38
+
39
+ class StatsResult(BaseModel):
40
+ """The result of a stats() query."""
41
+ path: PathTuple
42
+ # The number of leaf values.
43
+ total_count: int
44
+ # The approximate number of distinct leaf values.
45
+ approx_count_distinct: int
46
+
47
+ # Defined for ordinal features.
48
+ min_val: Optional[Union[float, datetime]] = None
49
+ max_val: Optional[Union[float, datetime]] = None
50
+
51
+ # Defined for text features.
52
+ avg_text_length: Optional[float] = None
53
+
54
+
55
+ class MediaResult(BaseModel):
56
+ """The result of a media() query."""
57
+ data: bytes
58
+
59
+
60
+ class BinaryOp(str, enum.Enum):
61
+ """The comparison operator between a column and a feature value."""
62
+ EQUALS = 'equals'
63
+ NOT_EQUAL = 'not_equal'
64
+ GREATER = 'greater'
65
+ GREATER_EQUAL = 'greater_equal'
66
+ LESS = 'less'
67
+ LESS_EQUAL = 'less_equal'
68
+
69
+
70
+ SearchType = Union[Literal['keyword'], Literal['semantic'], Literal['concept']]
71
+
72
+
73
+ class UnaryOp(str, enum.Enum):
74
+ """A unary operator on a feature."""
75
+ EXISTS = 'exists'
76
+
77
+
78
+ class ListOp(str, enum.Enum):
79
+ """A list operator on a feature."""
80
+ IN = 'in'
81
+
82
+
83
+ class SortOrder(str, enum.Enum):
84
+ """The sort order for a database query."""
85
+ DESC = 'DESC'
86
+ ASC = 'ASC'
87
+
88
+
89
+ class GroupsSortBy(str, enum.Enum):
90
+ """The sort for groups queries.
91
+
92
+ Either "count" which sorts by the count of feature value, or "value" which sorts by the
93
+ feature value itself.
94
+ """
95
+ COUNT = 'count'
96
+ VALUE = 'value'
97
+
98
+
99
+ class SortResult(BaseModel):
100
+ """The information about what is sorted after combining searches and explicit sorts."""
101
+ # The column that was sorted.
102
+ path: PathTuple
103
+ # The sort order.
104
+ order: SortOrder
105
+ # The alias of the column if it was aliased.
106
+ alias: Optional[str] = None
107
+ # The search index if the sort is by a search.
108
+ search_index: Optional[int] = None
109
+
110
+
111
+ class SearchResultInfo(BaseModel):
112
+ """The resulting sort order returned by the select rows schema."""
113
+ # The input path to the search.
114
+ search_path: PathTuple
115
+ # The resulting column that was searched.
116
+ result_path: PathTuple
117
+ # The alias of the UDF.
118
+ alias: Optional[str] = None
119
+
120
+
121
+ class SelectRowsSchemaUDF(BaseModel):
122
+ """The UDF for a select rows schema query."""
123
+ path: PathTuple
124
+ alias: Optional[str] = None
125
+
126
+
127
+ class SelectRowsSchemaResult(BaseModel):
128
+ """The result of a select rows schema query."""
129
+ data_schema: Schema
130
+ udfs: list[SelectRowsSchemaUDF] = []
131
+ search_results: list[SearchResultInfo] = []
132
+ sorts: Optional[list[SortResult]] = None
133
+
134
+
135
+ class Column(BaseModel):
136
+ """A column in the dataset."""
137
+ path: PathTuple
138
+ alias: Optional[str] = None # This is the renamed column during querying and response.
139
+
140
+ # Defined when the feature is another column.
141
+ signal_udf: Optional[Signal] = None
142
+
143
+ class Config:
144
+ smart_union = True
145
+
146
+ def __init__(self,
147
+ path: Path,
148
+ alias: Optional[str] = None,
149
+ signal_udf: Optional[Signal] = None,
150
+ **kwargs: Any):
151
+ """Initialize a column. We override __init__ to allow positional arguments for brevity."""
152
+ super().__init__(path=normalize_path(path), alias=alias, signal_udf=signal_udf, **kwargs)
153
+
154
+ @validator('signal_udf', pre=True)
155
+ def parse_signal_udf(cls, signal_udf: Optional[dict]) -> Optional[Signal]:
156
+ """Parse a signal to its specific subclass instance."""
157
+ if not signal_udf:
158
+ return None
159
+ return resolve_signal(signal_udf)
160
+
161
+
162
+ ColumnId = Union[Path, Column]
163
+
164
+
165
+ class DatasetUISettings(BaseModel):
166
+ """The UI persistent settings for a dataset."""
167
+ media_paths: list[PathTuple] = []
168
+ markdown_paths: list[PathTuple] = []
169
+
170
+ @validator('media_paths', pre=True)
171
+ def parse_media_paths(cls, media_paths: list) -> list:
172
+ """Parse a path, ensuring it is a tuple."""
173
+ return [normalize_path(path) for path in media_paths]
174
+
175
+
176
+ class DatasetSettings(BaseModel):
177
+ """The persistent settings for a dataset."""
178
+ ui: Optional[DatasetUISettings] = None
179
+ preferred_embedding: Optional[str] = None
180
+
181
+
182
+ class DatasetManifest(BaseModel):
183
+ """The manifest for a dataset."""
184
+ namespace: str
185
+ dataset_name: str
186
+ data_schema: Schema
187
+ # Number of items in the dataset.
188
+ num_items: int
189
+
190
+
191
+ def column_from_identifier(column: ColumnId) -> Column:
192
+ """Create a column from a column identifier."""
193
+ if isinstance(column, Column):
194
+ return column.copy()
195
+ return Column(path=column)
196
+
197
+
198
+ FeatureValue = Union[StrictInt, StrictFloat, StrictBool, StrictStr, StrictBytes, datetime]
199
+ FeatureListValue = list[StrictStr]
200
+ BinaryFilterTuple = tuple[Path, BinaryOp, FeatureValue]
201
+ ListFilterTuple = tuple[Path, ListOp, FeatureListValue]
202
+ UnaryFilterTuple = tuple[Path, UnaryOp]
203
+
204
+ FilterOp = Union[BinaryOp, UnaryOp, ListOp]
205
+
206
+
207
+ class SelectGroupsResult(BaseModel):
208
+ """The result of a select groups query."""
209
+ too_many_distinct: bool
210
+ counts: list[tuple[Optional[FeatureValue], int]]
211
+ bins: Optional[list[Bin]] = None
212
+
213
+
214
+ class Filter(BaseModel):
215
+ """A filter on a column."""
216
+ path: PathTuple
217
+ op: FilterOp
218
+ value: Optional[Union[FeatureValue, FeatureListValue]] = None
219
+
220
+
221
+ FilterLike = Union[Filter, BinaryFilterTuple, UnaryFilterTuple, ListFilterTuple]
222
+
223
+ SearchValue = StrictStr
224
+
225
+
226
+ class KeywordQuery(BaseModel):
227
+ """A keyword search query on a column."""
228
+ type: Literal['keyword'] = 'keyword'
229
+ search: SearchValue
230
+
231
+
232
+ class SemanticQuery(BaseModel):
233
+ """A semantic search on a column."""
234
+ type: Literal['semantic'] = 'semantic'
235
+ search: SearchValue
236
+ embedding: str
237
+
238
+
239
+ class ConceptQuery(BaseModel):
240
+ """A concept search query on a column."""
241
+ type: Literal['concept'] = 'concept'
242
+ concept_namespace: str
243
+ concept_name: str
244
+ embedding: str
245
+
246
+
247
+ class Search(BaseModel):
248
+ """A search on a column."""
249
+ path: Path
250
+ query: Union[KeywordQuery, SemanticQuery, ConceptQuery] = PydanticField(discriminator='type')
251
+
252
+
253
+ class Dataset(abc.ABC):
254
+ """The database implementation to query a dataset."""
255
+
256
+ namespace: str
257
+ dataset_name: str
258
+
259
+ def __init__(self, namespace: str, dataset_name: str):
260
+ """Initialize a dataset.
261
+
262
+ Args:
263
+ namespace: The dataset namespace.
264
+ dataset_name: The dataset name.
265
+ """
266
+ self.namespace = namespace
267
+ self.dataset_name = dataset_name
268
+
269
+ @abc.abstractmethod
270
+ def delete(self) -> None:
271
+ """Deletes the dataset."""
272
+ pass
273
+
274
+ @abc.abstractmethod
275
+ def manifest(self) -> DatasetManifest:
276
+ """Return the manifest for the dataset."""
277
+ pass
278
+
279
+ @abc.abstractmethod
280
+ def settings(self) -> DatasetSettings:
281
+ """Return the persistent settings for the dataset."""
282
+ pass
283
+
284
+ @abc.abstractmethod
285
+ def update_settings(self, settings: DatasetSettings) -> None:
286
+ """Update the settings for the dataset."""
287
+ pass
288
+
289
+ @abc.abstractmethod
290
+ def compute_signal(self,
291
+ signal: Signal,
292
+ leaf_path: Path,
293
+ task_step_id: Optional[TaskStepId] = None) -> None:
294
+ """Compute a signal for a column.
295
+
296
+ Args:
297
+ signal: The signal to compute over the given columns.
298
+ leaf_path: The leaf path to compute the signal on.
299
+ task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
300
+ progress of the task.
301
+ """
302
+ pass
303
+
304
+ def compute_embedding(self,
305
+ embedding: str,
306
+ path: Path,
307
+ task_step_id: Optional[TaskStepId] = None) -> None:
308
+ """Compute an embedding for a given field path."""
309
+ signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
310
+ self.compute_signal(signal, path)
311
+
312
+ @abc.abstractmethod
313
+ def delete_signal(self, signal_path: Path) -> None:
314
+ """Delete a computed signal from the dataset.
315
+
316
+ Args:
317
+ signal_path: The path holding the computed data of the signal.
318
+ """
319
+ pass
320
+
321
+ @abc.abstractmethod
322
+ def select_groups(
323
+ self,
324
+ leaf_path: Path,
325
+ filters: Optional[Sequence[FilterLike]] = None,
326
+ sort_by: Optional[GroupsSortBy] = None,
327
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
328
+ limit: Optional[int] = None,
329
+ bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
330
+ """Select grouped columns to power a histogram.
331
+
332
+ Args:
333
+ leaf_path: The leaf path to group by. The path can be a dot-separated string path, or a tuple
334
+ of fields.
335
+ filters: The filters to apply to the query.
336
+ sort_by: What to sort by, either "count" or "value".
337
+ sort_order: The sort order.
338
+ limit: The maximum number of rows to return.
339
+ bins: The bins to use when bucketizing a float column.
340
+
341
+ Returns
342
+ A `SelectGroupsResult` iterator where each row is a group.
343
+ """
344
+ raise NotImplementedError
345
+
346
+ @abc.abstractmethod
347
+ def select_rows(self,
348
+ columns: Optional[Sequence[ColumnId]] = None,
349
+ searches: Optional[Sequence[Search]] = None,
350
+ filters: Optional[Sequence[FilterLike]] = None,
351
+ sort_by: Optional[Sequence[Path]] = None,
352
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
353
+ limit: Optional[int] = 100,
354
+ offset: Optional[int] = 0,
355
+ task_step_id: Optional[TaskStepId] = None,
356
+ resolve_span: bool = False,
357
+ combine_columns: bool = False,
358
+ user: Optional[UserInfo] = None) -> SelectRowsResult:
359
+ """Select grouped columns to power a histogram.
360
+
361
+ Args:
362
+ columns: The columns to select. A column is an instance of `Column` which can either
363
+ define a path to a feature, or a column with an applied Transform, e.g. a Concept. If none,
364
+ it selects all columns.
365
+ searches: The searches to apply to the query.
366
+ filters: The filters to apply to the query.
367
+ sort_by: An ordered list of what to sort by. When defined, this is a list of aliases of column
368
+ names defined by the "alias" field in Column. If no alias is provided for a column, an
369
+ automatic alias is generated by combining each path element with a "."
370
+ For example, ('person', 'name') => person.name. For columns that are transform columns,
371
+ an alias must be provided explicitly. When sorting by a (nested) list of values, the sort
372
+ takes the minimum value when `sort_order` is `ASC`, and the maximum value when `sort_order`
373
+ is `DESC`.
374
+ sort_order: The sort order.
375
+ limit: The maximum number of rows to return.
376
+ offset: The offset to start returning rows from.
377
+ task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
378
+ progress.
379
+ resolve_span: Whether to resolve the span of the row.
380
+ combine_columns: Whether to combine columns into a single object. The object will be pruned
381
+ to only include sub-fields that correspond to the requested columns.
382
+ user: The authenticated user, if auth is enabled and the user is logged in. This is used to
383
+ apply ACL to the query, especially for concepts.
384
+
385
+ Returns
386
+ A SelectRowsResult iterator with rows of `Item`s.
387
+ """
388
+ pass
389
+
390
+ @abc.abstractmethod
391
+ def select_rows_schema(self,
392
+ columns: Optional[Sequence[ColumnId]] = None,
393
+ sort_by: Optional[Sequence[Path]] = None,
394
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
395
+ searches: Optional[Sequence[Search]] = None,
396
+ combine_columns: bool = False) -> SelectRowsSchemaResult:
397
+ """Returns the schema of the result of `select_rows` above with the same arguments."""
398
+ pass
399
+
400
+ @abc.abstractmethod
401
+ def stats(self, leaf_path: Path) -> StatsResult:
402
+ """Compute stats for a leaf path.
403
+
404
+ Args:
405
+ leaf_path: The leaf path to compute stats for.
406
+
407
+ Returns
408
+ A StatsResult.
409
+ """
410
+ pass
411
+
412
+ @abc.abstractmethod
413
+ def media(self, item_id: str, leaf_path: Path) -> MediaResult:
414
+ """Return the media for a leaf path.
415
+
416
+ Args:
417
+ item_id: The item id to get media for.
418
+ leaf_path: The leaf path for the media.
419
+
420
+ Returns
421
+ A MediaResult.
422
+ """
423
+ pass
424
+
425
+ @abc.abstractmethod
426
+ def to_json(self, filepath: Union[str, pathlib.Path], jsonl: bool = True) -> None:
427
+ """Export the dataset to a JSON file.
428
+
429
+ Args:
430
+ filepath: The path to the file to export to.
431
+ jsonl: Whether to export to JSONL or JSON.
432
+ """
433
+ pass
434
+
435
+ @abc.abstractmethod
436
+ def to_pandas(self) -> pd.DataFrame:
437
+ """Export the dataset to a pandas DataFrame."""
438
+ pass
439
+
440
+ @abc.abstractmethod
441
+ def to_parquet(self, filepath: Union[str, pathlib.Path]) -> None:
442
+ """Export the dataset to a parquet file.
443
+
444
+ Args:
445
+ filepath: The path to the file to export to.
446
+ """
447
+ pass
448
+
449
+ @abc.abstractmethod
450
+ def to_csv(self, filepath: Union[str, pathlib.Path]) -> None:
451
+ """Export the dataset to a csv file.
452
+
453
+ Args:
454
+ filepath: The path to the file to export to.
455
+ """
456
+ pass
457
+
458
+
459
+ def default_settings(dataset: Dataset) -> DatasetSettings:
460
+ """Gets the default settings for a dataset."""
461
+ schema = dataset.manifest().data_schema
462
+ leaf_paths = [path for path, field in schema.leafs.items() if field.dtype == DataType.STRING]
463
+ pool = ThreadPoolExecutor()
464
+ stats: list[StatsResult] = list(pool.map(lambda leaf: dataset.stats(leaf), leaf_paths))
465
+ sorted_stats = sorted([stat for stat in stats if stat.avg_text_length],
466
+ key=lambda stat: stat.avg_text_length or -1.0)
467
+ media_paths: set[PathTuple] = set()
468
+ if sorted_stats:
469
+ media_paths = set([sorted_stats[-1].path])
470
+
471
+ return DatasetSettings(ui=DatasetUISettings(media_paths=media_paths))
472
+
473
+
474
+ def make_parquet_id(signal: Signal,
475
+ source_path: PathTuple,
476
+ is_computed_signal: Optional[bool] = False) -> str:
477
+ """Return a unique identifier for this parquet table."""
478
+ # Don't use the VALUE_KEY as part of the parquet id to reduce the size of paths.
479
+ path = source_path[:-1] if source_path[-1] == VALUE_KEY else source_path
480
+ column_alias = '.'.join(map(str, path))
481
+ if column_alias.endswith('.*'):
482
+ # Remove the trailing .* from the column name.
483
+ column_alias = column_alias[:-2]
484
+
485
+ return f'{signal.key(is_computed_signal=is_computed_signal)}({column_alias})'
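As a usage sketch of the interface above (not part of the diff), a concept search issued through `select_rows` might look like the following. It assumes the DuckDB implementation below, an existing 'local/movies' dataset with a 'text' field, and that a 'gte-small' embedding has already been computed for that path; these names are illustrative.

# Sketch only; the dataset, path and embedding names are assumptions.
from lilac.data.dataset import ConceptQuery, Search
from lilac.data.dataset_duckdb import DatasetDuckDB

dataset = DatasetDuckDB('local', 'movies')
search = Search(
  path='text',
  query=ConceptQuery(
    concept_namespace='local', concept_name='toxicity', embedding='gte-small'))

# Rows come back sorted by the concept score (the search UDF supplies the sort).
result = dataset.select_rows(columns=['text'], searches=[search], limit=10)
for row in result:
  print(row)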
lilac/data/dataset_duckdb.py ADDED
@@ -0,0 +1,1717 @@
1
+ """The DuckDB implementation of the dataset database."""
2
+ import functools
3
+ import gc
4
+ import glob
5
+ import math
6
+ import os
7
+ import pathlib
8
+ import re
9
+ import shutil
10
+ import threading
11
+ from typing import Any, Iterable, Iterator, Optional, Sequence, Union, cast
12
+
13
+ import duckdb
14
+ import numpy as np
15
+ import pandas as pd
16
+ from pandas.api.types import is_object_dtype
17
+ from pydantic import BaseModel, validator
18
+ from typing_extensions import override
19
+
20
+ from ..auth import UserInfo
21
+ from ..batch_utils import deep_flatten, deep_unflatten
22
+ from ..embeddings.vector_store import VectorDBIndex
23
+ from ..env import data_path, env
24
+ from ..schema import (
25
+ MANIFEST_FILENAME,
26
+ PATH_WILDCARD,
27
+ TEXT_SPAN_END_FEATURE,
28
+ TEXT_SPAN_START_FEATURE,
29
+ UUID_COLUMN,
30
+ VALUE_KEY,
31
+ Bin,
32
+ DataType,
33
+ Field,
34
+ Item,
35
+ Path,
36
+ PathKey,
37
+ PathTuple,
38
+ RichData,
39
+ Schema,
40
+ SourceManifest,
41
+ column_paths_match,
42
+ is_float,
43
+ is_integer,
44
+ is_ordinal,
45
+ is_temporal,
46
+ normalize_path,
47
+ signal_type_supports_dtype,
48
+ )
49
+ from ..signals.concept_labels import ConceptLabelsSignal
50
+ from ..signals.concept_scorer import ConceptScoreSignal
51
+ from ..signals.semantic_similarity import SemanticSimilaritySignal
52
+ from ..signals.signal import (
53
+ Signal,
54
+ TextEmbeddingSignal,
55
+ VectorSignal,
56
+ get_signal_by_type,
57
+ resolve_signal,
58
+ )
59
+ from ..signals.substring_search import SubstringSignal
60
+ from ..tasks import TaskStepId, progress
61
+ from ..utils import DebugTimer, get_dataset_output_dir, log, open_file
62
+ from . import dataset
63
+ from .dataset import (
64
+ BinaryOp,
65
+ Column,
66
+ ColumnId,
67
+ Dataset,
68
+ DatasetManifest,
69
+ DatasetSettings,
70
+ FeatureListValue,
71
+ FeatureValue,
72
+ Filter,
73
+ FilterLike,
74
+ GroupsSortBy,
75
+ ListOp,
76
+ MediaResult,
77
+ Search,
78
+ SearchResultInfo,
79
+ SelectGroupsResult,
80
+ SelectRowsResult,
81
+ SelectRowsSchemaResult,
82
+ SelectRowsSchemaUDF,
83
+ SortOrder,
84
+ SortResult,
85
+ StatsResult,
86
+ UnaryOp,
87
+ column_from_identifier,
88
+ default_settings,
89
+ make_parquet_id,
90
+ )
91
+ from .dataset_utils import (
92
+ count_primitives,
93
+ create_signal_schema,
94
+ flatten_keys,
95
+ merge_schemas,
96
+ schema_contains_path,
97
+ sparse_to_dense_compute,
98
+ wrap_in_dicts,
99
+ write_embeddings_to_disk,
100
+ write_items_to_parquet,
101
+ )
102
+
103
+ UUID_INDEX_FILENAME = 'uuids.npy'
104
+
105
+ SIGNAL_MANIFEST_FILENAME = 'signal_manifest.json'
106
+ DATASET_SETTINGS_FILENAME = 'settings.json'
107
+ SOURCE_VIEW_NAME = 'source'
108
+
109
+ # Sample size for approximating the distinct count of a column.
110
+ SAMPLE_SIZE_DISTINCT_COUNT = 100_000
111
+ NUM_AUTO_BINS = 15
112
+
113
+ BINARY_OP_TO_SQL: dict[BinaryOp, str] = {
114
+ BinaryOp.EQUALS: '=',
115
+ BinaryOp.NOT_EQUAL: '!=',
116
+ BinaryOp.GREATER: '>',
117
+ BinaryOp.GREATER_EQUAL: '>=',
118
+ BinaryOp.LESS: '<',
119
+ BinaryOp.LESS_EQUAL: '<='
120
+ }
121
+
122
+
123
+ class DuckDBSearchUDF(BaseModel):
124
+ """The transformation of searches to column UDFs."""
125
+ udf: Column
126
+ search_path: PathTuple
127
+ output_path: PathTuple
128
+ sort: Optional[tuple[PathTuple, SortOrder]] = None
129
+
130
+
131
+ class DuckDBSearchUDFs(BaseModel):
132
+ """The transformation of searches to column UDFs with sorts."""
133
+ udfs: list[Column]
134
+ output_paths: list[PathTuple]
135
+ sorts: list[tuple[PathTuple, SortOrder]]
136
+
137
+
138
+ class DatasetDuckDB(Dataset):
139
+ """The DuckDB implementation of the dataset database."""
140
+
141
+ def __init__(self, namespace: str, dataset_name: str, vector_store: str = 'hnsw'):
142
+ super().__init__(namespace, dataset_name)
143
+
144
+ self.dataset_path = get_dataset_output_dir(data_path(), namespace, dataset_name)
145
+
146
+ # TODO: Infer the manifest from the parquet files so this is lighter weight.
147
+ self._source_manifest = read_source_manifest(self.dataset_path)
148
+ self._signal_manifests: list[SignalManifest] = []
149
+ self.con = duckdb.connect(database=':memory:')
150
+
151
+ # Maps a path and embedding to the vector index. This is lazily generated as needed.
152
+ self._vector_indices: dict[tuple[PathKey, str], VectorDBIndex] = {}
153
+ self.vector_store = vector_store
154
+ self._manifest_lock = threading.Lock()
155
+
156
+ # Calling settings creates the default settings JSON file if it doesn't exist.
157
+ self.settings()
158
+
159
+ @override
160
+ def delete(self) -> None:
161
+ """Deletes the dataset."""
162
+ self.con.close()
163
+ shutil.rmtree(self.dataset_path, ignore_errors=True)
164
+
165
+ def _create_view(self, view_name: str, files: list[str]) -> None:
166
+ self.con.execute(f"""
167
+ CREATE OR REPLACE VIEW {_escape_col_name(view_name)} AS (SELECT * FROM read_parquet({files}));
168
+ """)
169
+
170
+ # NOTE: This is cached, but when the latest mtime of any file in the dataset directory changes
171
+ # the results are invalidated.
172
+ @functools.cache
173
+ def _recompute_joint_table(self, latest_mtime_micro_sec: int) -> DatasetManifest:
174
+ del latest_mtime_micro_sec # This is used as the cache key.
175
+ merged_schema = self._source_manifest.data_schema.copy(deep=True)
176
+ self._signal_manifests = []
177
+ # Make a joined view of all the column groups.
178
+ self._create_view(SOURCE_VIEW_NAME,
179
+ [os.path.join(self.dataset_path, f) for f in self._source_manifest.files])
180
+
181
+ # Add the signal column groups.
182
+ for root, _, files in os.walk(self.dataset_path):
183
+ for file in files:
184
+ if not file.endswith(SIGNAL_MANIFEST_FILENAME):
185
+ continue
186
+
187
+ with open_file(os.path.join(root, file)) as f:
188
+ signal_manifest = SignalManifest.parse_raw(f.read())
189
+ self._signal_manifests.append(signal_manifest)
190
+ signal_files = [os.path.join(root, f) for f in signal_manifest.files]
191
+ if signal_files:
192
+ self._create_view(signal_manifest.parquet_id, signal_files)
193
+
194
+ merged_schema = merge_schemas([self._source_manifest.data_schema] +
195
+ [m.data_schema for m in self._signal_manifests])
196
+
197
+ # The logic below generates the following example query:
198
+ # CREATE OR REPLACE VIEW t AS (
199
+ # SELECT
200
+ # source.*,
201
+ # "parquet_id1"."root_column" AS "parquet_id1",
202
+ # "parquet_id2"."root_column" AS "parquet_id2"
203
+ # FROM source JOIN "parquet_id1" USING (uuid,) JOIN "parquet_id2" USING (uuid,)
204
+ # );
205
+ # NOTE: "root_column" for each signal is defined as the top-level column.
206
+ select_sql = ', '.join([f'{SOURCE_VIEW_NAME}.*'] + [(
207
+ f'{_escape_col_name(manifest.parquet_id)}.{_escape_col_name(_root_column(manifest))} '
208
+ f'AS {_escape_col_name(manifest.parquet_id)}')
209
+ for manifest in self._signal_manifests
210
+ if manifest.files])
211
+ join_sql = ' '.join([SOURCE_VIEW_NAME] + [
212
+ f'join {_escape_col_name(manifest.parquet_id)} using ({UUID_COLUMN},)'
213
+ for manifest in self._signal_manifests
214
+ if manifest.files
215
+ ])
216
+ view_or_table = 'TABLE'
217
+ use_views = env('DUCKDB_USE_VIEWS', 0) or 0
218
+ if int(use_views):
219
+ view_or_table = 'VIEW'
220
+ sql_cmd = f"""CREATE OR REPLACE {view_or_table} t AS (SELECT {select_sql} FROM {join_sql})"""
221
+ self.con.execute(sql_cmd)
222
+
223
+ # Get the total size of the table.
224
+ size_query = 'SELECT COUNT() as count FROM t'
225
+ size_query_result = cast(Any, self._query(size_query)[0])
226
+ num_items = cast(int, size_query_result[0])
227
+
228
+ return DatasetManifest(
229
+ namespace=self.namespace,
230
+ dataset_name=self.dataset_name,
231
+ data_schema=merged_schema,
232
+ num_items=num_items)
233
+
234
+ @override
235
+ def manifest(self) -> DatasetManifest:
236
+ # Use the latest modification time of all files under the dataset path as the cache key for
237
+ # re-computing the manifest and the joined view.
238
+ with self._manifest_lock:
239
+ all_dataset_files = glob.iglob(os.path.join(self.dataset_path, '**'), recursive=True)
240
+ latest_mtime = max(map(os.path.getmtime, all_dataset_files))
241
+ latest_mtime_micro_sec = int(latest_mtime * 1e6)
242
+ return self._recompute_joint_table(latest_mtime_micro_sec)
243
+
244
+ @override
245
+ def settings(self) -> DatasetSettings:
246
+ # Read the settings file from disk.
247
+ settings_filepath = _settings_filepath(self.namespace, self.dataset_name)
248
+ if not os.path.exists(settings_filepath):
249
+ self.update_settings(default_settings(self))
250
+
251
+ with open(settings_filepath) as f:
252
+ return DatasetSettings.parse_raw(f.read())
253
+
254
+ @override
255
+ def update_settings(self, settings: DatasetSettings) -> None:
256
+ # Write the settings file to disk.
257
+ settings_filepath = _settings_filepath(self.namespace, self.dataset_name)
258
+ with open(settings_filepath, 'w') as f:
259
+ f.write(settings.json())
260
+
261
+ def count(self, filters: Optional[list[FilterLike]] = None) -> int:
262
+ """Count the number of rows."""
263
+ raise NotImplementedError('count is not yet implemented for DuckDB.')
264
+
265
+ def _get_vector_db_index(self, embedding: str, path: PathTuple) -> VectorDBIndex:
266
+ # Refresh the manifest to make sure we have the latest signal manifests.
267
+ self.manifest()
268
+ index_key = (path, embedding)
269
+ if index_key in self._vector_indices:
270
+ return self._vector_indices[index_key]
271
+
272
+ manifests = [
273
+ m for m in self._signal_manifests
274
+ if schema_contains_path(m.data_schema, path) and m.vector_store and m.signal.name == embedding
275
+ ]
276
+ if not manifests:
277
+ raise ValueError(f'No embedding found for path {path}.')
278
+ if len(manifests) > 1:
279
+ raise ValueError(f'Multiple embeddings found for path {path}. Got: {manifests}')
280
+ manifest = manifests[0]
281
+ if not manifest.vector_store:
282
+ raise ValueError(f'Signal manifest for path {path} is not an embedding. '
283
+ f'Got signal manifest: {manifest}')
284
+
285
+ base_path = os.path.join(self.dataset_path, _signal_dir(manifest.enriched_path),
286
+ manifest.signal.name)
287
+ with DebugTimer(f'Loading vector store "{manifest.vector_store}" for "{path}"'
288
+ f' with embedding "{embedding}"'):
289
+ vector_index = VectorDBIndex(manifest.vector_store)
290
+ vector_index.load(base_path)
291
+ # Cache the vector index.
292
+ self._vector_indices[index_key] = vector_index
293
+ return vector_index
294
+
295
+ @override
296
+ def compute_signal(self,
297
+ signal: Signal,
298
+ leaf_path: Path,
299
+ task_step_id: Optional[TaskStepId] = None) -> None:
300
+ if isinstance(signal, TextEmbeddingSignal):
301
+ return self.compute_embedding(signal.name, leaf_path, task_step_id)
302
+ source_path = normalize_path(leaf_path)
303
+ manifest = self.manifest()
304
+
305
+ if task_step_id is None:
306
+ # Make a dummy task step so we report progress via tqdm.
307
+ task_step_id = ('', 0)
308
+
309
+ # The manifest may have changed after computing the dependencies.
310
+ manifest = self.manifest()
311
+
312
+ signal_col = Column(path=source_path, alias='value', signal_udf=signal)
313
+ select_rows_result = self.select_rows([signal_col],
314
+ task_step_id=task_step_id,
315
+ resolve_span=True)
316
+ df = select_rows_result.df()
317
+ values = df['value']
318
+
319
+ enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
320
+ spec = _split_path_into_subpaths_of_lists(enriched_path)
321
+ output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
322
+ signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
323
+ enriched_signal_items = cast(Iterable[Item], wrap_in_dicts(values, spec))
324
+ for uuid, item in zip(df[UUID_COLUMN], enriched_signal_items):
325
+ item[UUID_COLUMN] = uuid
326
+
327
+ enriched_signal_items = list(enriched_signal_items)
328
+ parquet_filename, _ = write_items_to_parquet(
329
+ items=enriched_signal_items,
330
+ output_dir=output_dir,
331
+ schema=signal_schema,
332
+ filename_prefix='data',
333
+ shard_index=0,
334
+ num_shards=1)
335
+
336
+ signal_manifest = SignalManifest(
337
+ files=[parquet_filename],
338
+ data_schema=signal_schema,
339
+ signal=signal,
340
+ enriched_path=source_path,
341
+ parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True))
342
+ signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)
343
+ with open_file(signal_manifest_filepath, 'w') as f:
344
+ f.write(signal_manifest.json(exclude_none=True, indent=2))
345
+ log(f'Wrote signal output to {output_dir}')
346
+
347
+ @override
348
+ def compute_embedding(self,
349
+ embedding: str,
350
+ path: Path,
351
+ task_step_id: Optional[TaskStepId] = None) -> None:
352
+ source_path = normalize_path(path)
353
+ manifest = self.manifest()
354
+
355
+ if task_step_id is None:
356
+ # Make a dummy task step so we report progress via tqdm.
357
+ task_step_id = ('', 0)
358
+
359
+ signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
360
+ signal_col = Column(path=source_path, alias='value', signal_udf=signal)
361
+ select_rows_result = self.select_rows([signal_col],
362
+ task_step_id=task_step_id,
363
+ resolve_span=True)
364
+ df = select_rows_result.df()
365
+ values = df['value']
366
+
367
+ enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
368
+ output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
369
+ signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
370
+
371
+ write_embeddings_to_disk(
372
+ vector_store=self.vector_store,
373
+ uuids=df[UUID_COLUMN],
374
+ signal_items=values,
375
+ output_dir=output_dir)
376
+
377
+ del select_rows_result, df, values
378
+ gc.collect()
379
+
380
+ signal_manifest = SignalManifest(
381
+ files=[],
382
+ data_schema=signal_schema,
383
+ signal=signal,
384
+ enriched_path=source_path,
385
+ parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True),
386
+ vector_store=self.vector_store)
387
+ signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)
388
+
389
+ with open_file(signal_manifest_filepath, 'w') as f:
390
+ f.write(signal_manifest.json(exclude_none=True, indent=2))
391
+ log(f'Wrote embedding index to {output_dir}')
392
+
393
+ @override
394
+ def delete_signal(self, signal_path: Path) -> None:
395
+ signal_path = normalize_path(signal_path)
396
+ manifest = self.manifest()
397
+ if not manifest.data_schema.has_field(signal_path):
398
+ raise ValueError(f'Unknown signal path: {signal_path}')
399
+
400
+ output_dir = os.path.join(self.dataset_path, _signal_dir(signal_path))
401
+ shutil.rmtree(output_dir, ignore_errors=True)
402
+
403
+ def _validate_filters(self, filters: Sequence[Filter], col_aliases: dict[str, PathTuple],
404
+ manifest: DatasetManifest) -> None:
405
+ for filter in filters:
406
+ if filter.path[0] in col_aliases:
407
+ # This is a filter on a column alias, which is always allowed.
408
+ continue
409
+
410
+ current_field = Field(fields=manifest.data_schema.fields)
411
+ for path_part in filter.path:
412
+ if path_part == VALUE_KEY:
413
+ if not current_field.dtype:
414
+ raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')
415
+ continue
416
+ if current_field.fields:
417
+ if path_part not in current_field.fields:
418
+ raise ValueError(f'Unable to filter on path {filter.path}. '
419
+ f'Path part "{path_part}" not found in the dataset.')
420
+ current_field = current_field.fields[str(path_part)]
421
+ continue
422
+ elif current_field.repeated_field:
423
+ current_field = current_field.repeated_field
424
+ continue
425
+ else:
426
+ raise ValueError(f'Unable to filter on path {filter.path}. '
427
+ f'Path part "{path_part}" is not defined on a primitive value.')
428
+
429
+ while current_field.repeated_field:
430
+ current_field = current_field.repeated_field
431
+ filter.path = (*filter.path, PATH_WILDCARD)
432
+
433
+ if not current_field.dtype:
434
+ raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')
435
+
436
+ def _validate_udfs(self, udf_cols: Sequence[Column], source_schema: Schema) -> None:
437
+ for col in udf_cols:
438
+ path = col.path
439
+
440
+ # Signal transforms must operate on a leaf field.
441
+ leaf = source_schema.leafs.get(path)
442
+ if not leaf or not leaf.dtype:
443
+ raise ValueError(f'Leaf "{path}" not found in dataset. '
444
+ 'Signal transforms must operate on a leaf field.')
445
+
446
+ # Signal transforms must have the same dtype as the leaf field.
447
+ signal = cast(Signal, col.signal_udf)
448
+ if not signal_type_supports_dtype(signal.input_type, leaf.dtype):
449
+ raise ValueError(f'Leaf "{path}" has dtype "{leaf.dtype}" which is not supported '
450
+ f'by "{signal.key()}" with signal input type "{signal.input_type}".')
451
+
452
+ def _validate_selection(self, columns: Sequence[Column], select_schema: Schema) -> None:
453
+ # Validate all the columns and make sure they exist in the `select_schema`.
454
+ for column in columns:
455
+ current_field = Field(fields=select_schema.fields)
456
+ path = column.path
457
+ for path_part in path:
458
+ if path_part == VALUE_KEY:
459
+ if not current_field.dtype:
460
+ raise ValueError(f'Unable to select path {path}. The field has no value.')
461
+ continue
462
+ if current_field.fields:
463
+ if path_part not in current_field.fields:
464
+ raise ValueError(f'Unable to select path {path}. '
465
+ f'Path part "{path_part}" not found in the dataset.')
466
+ current_field = current_field.fields[path_part]
467
+ continue
468
+ elif current_field.repeated_field:
469
+ if path_part.isdigit():
470
+ raise ValueError(f'Unable to select path {path}. Selecting a specific index of '
471
+ 'a repeated field is currently not supported.')
472
+ if path_part != PATH_WILDCARD:
473
+ raise ValueError(f'Unable to select path {path}. '
474
+ f'Path part "{path_part}" should be a wildcard.')
475
+ current_field = current_field.repeated_field
476
+ elif not current_field.dtype:
477
+ raise ValueError(f'Unable to select path {path}. '
478
+ f'Path part "{path_part}" is not defined on a primitive value.')
479
+
480
+ def _validate_columns(self, columns: Sequence[Column], source_schema: Schema,
481
+ select_schema: Schema) -> None:
482
+ udf_cols = [col for col in columns if col.signal_udf]
483
+ self._validate_udfs(udf_cols, source_schema)
484
+ self._validate_selection(columns, select_schema)
485
+
486
+ def _validate_sort_path(self, path: PathTuple, schema: Schema) -> None:
487
+ current_field = Field(fields=schema.fields)
488
+ for path_part in path:
489
+ if path_part == VALUE_KEY:
490
+ if not current_field.dtype:
491
+ raise ValueError(f'Unable to sort by path {path}. The field has no value.')
492
+ continue
493
+ if current_field.fields:
494
+ if path_part not in current_field.fields:
495
+ raise ValueError(f'Unable to sort by path {path}. '
496
+ f'Path part "{path_part}" not found in the dataset.')
497
+ current_field = current_field.fields[path_part]
498
+ continue
499
+ elif current_field.repeated_field:
500
+ if path_part.isdigit():
501
+ raise ValueError(f'Unable to sort by path {path}. Selecting a specific index of '
502
+ 'a repeated field is currently not supported.')
503
+ if path_part != PATH_WILDCARD:
504
+ raise ValueError(f'Unable to sort by path {path}. '
505
+ f'Path part "{path_part}" should be a wildcard.')
506
+ current_field = current_field.repeated_field
507
+ elif not current_field.dtype:
508
+ raise ValueError(f'Unable to sort by path {path}. '
509
+ f'Path part "{path_part}" is not defined on a primitive value.')
510
+ if not current_field.dtype:
511
+ raise ValueError(f'Unable to sort by path {path}. The field has no value.')
512
+
513
+ @override
514
+ def stats(self, leaf_path: Path) -> StatsResult:
515
+ if not leaf_path:
516
+ raise ValueError('leaf_path must be provided')
517
+ path = normalize_path(leaf_path)
518
+ manifest = self.manifest()
519
+ leaf = manifest.data_schema.get_field(path)
520
+ # Find the inner-most leaf in case this field is repeated.
521
+ while leaf.repeated_field:
522
+ leaf = leaf.repeated_field
523
+ path = (*path, PATH_WILDCARD)
524
+
525
+ if not leaf.dtype:
526
+ raise ValueError(f'Leaf "{path}" not found in dataset')
527
+
528
+ duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
529
+ inner_select = _select_sql(
530
+ duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))
531
+
532
+ # Compute approximate count by sampling the data to avoid OOM.
533
+ sample_size = SAMPLE_SIZE_DISTINCT_COUNT
534
+ avg_length_query = ''
535
+ if leaf.dtype == DataType.STRING:
536
+ avg_length_query = ', avg(length(val)) as avgTextLength'
537
+
538
+ row: Optional[tuple[int, ...]] = None
539
+ if leaf.dtype == DataType.BOOLEAN:
540
+ approx_count_distinct = 2
541
+ else:
542
+ approx_count_query = f"""
543
+ SELECT approx_count_distinct(val) as approxCountDistinct {avg_length_query}
544
+ FROM (SELECT {inner_select} AS val FROM t LIMIT {sample_size});
545
+ """
546
+ row = self._query(approx_count_query)[0]
547
+ approx_count_distinct = row[0]
548
+
549
+ total_count_query = f'SELECT count(val) FROM (SELECT {inner_select} as val FROM t)'
550
+ total_count = self._query(total_count_query)[0][0]
551
+
552
+ if leaf.dtype != DataType.BOOLEAN:
553
+ # Adjust the counts for the sample size.
554
+ factor = max(1, total_count / sample_size)
555
+ approx_count_distinct = round(approx_count_distinct * factor)
556
+
557
+ result = StatsResult(
558
+ path=path, total_count=total_count, approx_count_distinct=approx_count_distinct)
559
+
560
+ if leaf.dtype == DataType.STRING and row:
561
+ result.avg_text_length = row[1]
562
+
563
+ # Compute min/max values for ordinal leafs, without sampling the data.
564
+ if is_ordinal(leaf.dtype):
565
+ min_max_query = f"""
566
+ SELECT MIN(val) AS minVal, MAX(val) AS maxVal
567
+ FROM (SELECT {inner_select} as val FROM t)
568
+ {'WHERE NOT isnan(val)' if is_float(leaf.dtype) else ''}
569
+ """
570
+ row = self._query(min_max_query)[0]
571
+ result.min_val, result.max_val = row
572
+
573
+ return result
574
+
575
+ @override
576
+ def select_groups(
577
+ self,
578
+ leaf_path: Path,
579
+ filters: Optional[Sequence[FilterLike]] = None,
580
+ sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT,
581
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
582
+ limit: Optional[int] = None,
583
+ bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
584
+ if not leaf_path:
585
+ raise ValueError('leaf_path must be provided')
586
+ path = normalize_path(leaf_path)
587
+ manifest = self.manifest()
588
+ leaf = manifest.data_schema.get_field(path)
589
+ # Find the inner-most leaf in case this field is repeated.
590
+ while leaf.repeated_field:
591
+ leaf = leaf.repeated_field
592
+ path = (*path, PATH_WILDCARD)
593
+
594
+ if not leaf.dtype:
595
+ raise ValueError(f'Leaf "{path}" not found in dataset')
596
+
597
+ inner_val = 'inner_val'
598
+ outer_select = inner_val
599
+ # Normalize the bins to be `list[Bin]`.
600
+ named_bins = _normalize_bins(bins or leaf.bins)
601
+ stats = self.stats(leaf_path)
602
+
603
+ leaf_is_float = is_float(leaf.dtype)
604
+ leaf_is_integer = is_integer(leaf.dtype)
605
+ if not leaf.categorical and (leaf_is_float or leaf_is_integer):
606
+ if named_bins is None:
607
+ # Auto-bin.
608
+ named_bins = _auto_bins(stats, NUM_AUTO_BINS)
609
+
610
+ sql_bounds = []
611
+ for label, start, end in named_bins:
612
+ if start is None:
613
+ start = cast(float, "'-Infinity'")
614
+ if end is None:
615
+ end = cast(float, "'Infinity'")
616
+ sql_bounds.append(f"('{label}', {start}, {end})")
617
+
618
+ bin_index_col = 'col0'
619
+ bin_min_col = 'col1'
620
+ bin_max_col = 'col2'
621
+ is_nan_filter = f'NOT isnan({inner_val}) AND' if leaf_is_float else ''
622
+
623
+ # We cast the field to `double` so binning works for both `float` and `int` fields.
624
+ outer_select = f"""(
625
+ SELECT {bin_index_col} FROM (
626
+ VALUES {', '.join(sql_bounds)}
627
+ ) WHERE {is_nan_filter}
628
+ {inner_val}::DOUBLE >= {bin_min_col} AND {inner_val}::DOUBLE < {bin_max_col}
629
+ )"""
630
+ else:
631
+ if stats.approx_count_distinct >= dataset.TOO_MANY_DISTINCT:
632
+ return SelectGroupsResult(too_many_distinct=True, counts=[], bins=named_bins)
633
+
634
+ count_column = 'count'
635
+ value_column = 'value'
636
+
637
+ limit_query = f'LIMIT {limit}' if limit else ''
638
+ duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
639
+ inner_select = _select_sql(
640
+ duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))
641
+
642
+ filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest)
643
+ filter_queries = self._create_where(manifest, filters, searches=[])
644
+
645
+ where_query = ''
646
+ if filter_queries:
647
+ where_query = f"WHERE {' AND '.join(filter_queries)}"
648
+
649
+ query = f"""
650
+ SELECT {outer_select} AS {value_column}, COUNT() AS {count_column}
651
+ FROM (SELECT {inner_select} AS {inner_val} FROM t {where_query})
652
+ GROUP BY {value_column}
653
+ ORDER BY {sort_by} {sort_order}
654
+ {limit_query}
655
+ """
656
+ df = self._query_df(query)
657
+ counts = list(df.itertuples(index=False, name=None))
658
+ if is_temporal(leaf.dtype):
659
+ # Replace any NaT with None and convert pd.Timestamp values to native datetime objects.
660
+ counts = [(None if pd.isnull(val) else val.to_pydatetime(), count) for val, count in counts]
661
+ return SelectGroupsResult(too_many_distinct=False, counts=counts, bins=named_bins)
662
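# Usage sketch (editorial comment, not part of this file): computing a histogram over a
# leaf path with explicit bins; the dataset name and bin edges are illustrative.
#
#   ds = DatasetDuckDB('local', 'movies')
#   groups = ds.select_groups('rating', bins=[0.0, 2.5, 5.0, 7.5, 10.0])
#   for value, count in groups.counts:
#     print(value, count)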
+
663
+ def _topk_udf_to_sort_by(
664
+ self,
665
+ udf_columns: list[Column],
666
+ sort_by: list[PathTuple],
667
+ limit: Optional[int],
668
+ sort_order: Optional[SortOrder],
669
+ ) -> Optional[Column]:
670
+ if (sort_order != SortOrder.DESC) or (not limit) or (not sort_by):
671
+ return None
672
+ if len(sort_by) < 1:
673
+ return None
674
+ primary_sort_by = sort_by[0]
675
+ udf_cols_to_sort_by = [
676
+ udf_col for udf_col in udf_columns if udf_col.alias == primary_sort_by[0] or
677
+ _path_contains(_col_destination_path(udf_col), primary_sort_by)
678
+ ]
679
+ if not udf_cols_to_sort_by:
680
+ return None
681
+ udf_col = udf_cols_to_sort_by[0]
682
+ if udf_col.signal_udf and not isinstance(udf_col.signal_udf, VectorSignal):
683
+ return None
684
+ return udf_col
685
+
686
+ def _normalize_columns(self, columns: Optional[Sequence[ColumnId]],
687
+ schema: Schema) -> list[Column]:
688
+ """Normalizes the columns to a list of `Column` objects."""
689
+ cols = [column_from_identifier(col) for col in columns or []]
690
+ star_in_cols = any(col.path == ('*',) for col in cols)
691
+ if not cols or star_in_cols:
692
+ # Select all columns.
693
+ cols.extend([Column((name,)) for name in schema.fields.keys()])
694
+ if star_in_cols:
695
+ cols = [col for col in cols if col.path != ('*',)]
696
+ return cols
697
+
698
+ def _merge_sorts(self, search_udfs: list[DuckDBSearchUDF], sort_by: Optional[Sequence[Path]],
699
+ sort_order: Optional[SortOrder]) -> list[SortResult]:
700
+ # True when the user has explicitly sorted by the alias of a search UDF (e.g. in ASC order).
701
+ is_explicit_search_sort = False
702
+ for sort_by_path in sort_by or []:
703
+ for search_udf in search_udfs:
704
+ if column_paths_match(sort_by_path, search_udf.output_path):
705
+ is_explicit_search_sort = True
706
+ break
707
+
708
+ sort_results: list[SortResult] = []
709
+ if sort_by and not is_explicit_search_sort:
710
+ if not sort_order:
711
+ raise ValueError('`sort_order` is required when `sort_by` is specified.')
712
+ # If the user has explicitly set a sort by, and it's not a search UDF alias, override.
713
+ sort_results = [
714
+ SortResult(path=normalize_path(sort_by), order=sort_order) for sort_by in sort_by if sort_by
715
+ ]
716
+ else:
717
+ search_udfs_with_sort = [search_udf for search_udf in search_udfs if search_udf.sort]
718
+ if search_udfs_with_sort:
719
+ # Override the sort by the last search sort order when the user hasn't provided an
720
+ # explicit sort order.
721
+ last_search_udf = search_udfs_with_sort[-1]
722
+ assert last_search_udf.sort, 'Expected search UDFs with sort to have a sort.'
723
+ udf_sort_path, udf_sort_order = last_search_udf.sort
724
+ sort_results = [
725
+ SortResult(
726
+ path=udf_sort_path,
727
+ order=sort_order or udf_sort_order,
728
+ search_index=len(search_udfs_with_sort) - 1)
729
+ ]
730
+
731
+ return sort_results
732
+
733
+ @override
734
+ def select_rows(self,
735
+ columns: Optional[Sequence[ColumnId]] = None,
736
+ searches: Optional[Sequence[Search]] = None,
737
+ filters: Optional[Sequence[FilterLike]] = None,
738
+ sort_by: Optional[Sequence[Path]] = None,
739
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
740
+ limit: Optional[int] = None,
741
+ offset: Optional[int] = 0,
742
+ task_step_id: Optional[TaskStepId] = None,
743
+ resolve_span: bool = False,
744
+ combine_columns: bool = False,
745
+ user: Optional[UserInfo] = None) -> SelectRowsResult:
746
+ manifest = self.manifest()
747
+ cols = self._normalize_columns(columns, manifest.data_schema)
748
+
749
+ # Always return the UUID column.
750
+ col_paths = [col.path for col in cols]
751
+ if (UUID_COLUMN,) not in col_paths:
752
+ cols.append(column_from_identifier(UUID_COLUMN))
753
+
754
+ schema = manifest.data_schema
755
+
756
+ if combine_columns:
757
+ schema = self.select_rows_schema(
758
+ columns, sort_by, sort_order, searches, combine_columns=True).data_schema
759
+
760
+ self._validate_columns(cols, manifest.data_schema, schema)
761
+ self._normalize_searches(searches, manifest)
762
+ search_udfs = self._search_udfs(searches, manifest)
763
+ cols.extend([search_udf.udf for search_udf in search_udfs])
764
+ udf_columns = [col for col in cols if col.signal_udf]
765
+
766
+ # Set extra information on any concept signals.
767
+ for udf_col in udf_columns:
768
+ if isinstance(udf_col.signal_udf, (ConceptScoreSignal, ConceptLabelsSignal)):
769
+ # Concepts are access controlled, so we tell the signal about the user.
770
+ udf_col.signal_udf.set_user(user)
771
+
772
+ # Decide on the exact sorting order.
773
+ sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)
774
+ sort_by = cast(list[PathTuple],
775
+ [(sort.alias,) if sort.alias else sort.path for sort in sort_results])
776
+ # Choose the first sort order as we only support a single sort order for now.
777
+ sort_order = sort_results[0].order if sort_results else None
778
+
779
+ col_aliases: dict[str, PathTuple] = {col.alias: col.path for col in cols if col.alias}
780
+ udf_aliases: dict[str, PathTuple] = {
781
+ col.alias: col.path for col in cols if col.signal_udf and col.alias
782
+ }
783
+ path_to_udf_col_name: dict[PathTuple, str] = {}
784
+ for col in cols:
785
+ if col.signal_udf:
786
+ alias = col.alias or _unique_alias(col)
787
+ dest_path = _col_destination_path(col)
788
+ path_to_udf_col_name[dest_path] = alias
789
+
790
+ # Filtering and searching.
791
+ where_query = ''
792
+ filters, udf_filters = self._normalize_filters(filters, col_aliases, udf_aliases, manifest)
793
+ filter_queries = self._create_where(manifest, filters, searches)
794
+ if filter_queries:
795
+ where_query = f"WHERE {' AND '.join(filter_queries)}"
796
+
797
+ total_num_rows = manifest.num_items
798
+ con = self.con.cursor()
799
+
800
+ topk_udf_col = self._topk_udf_to_sort_by(udf_columns, sort_by, limit, sort_order)
801
+ if topk_udf_col:
802
+ path_keys: Optional[list[PathKey]] = None
803
+ if where_query:
804
+ # If there are filters, we need to send UUIDs to the top k query.
805
+ df = con.execute(f'SELECT {UUID_COLUMN} FROM t {where_query}').df()
806
+ total_num_rows = len(df)
807
+ # Convert UUIDs to path keys.
808
+ path_keys = [(uuid,) for uuid in df[UUID_COLUMN]]
809
+
810
+ if path_keys is not None and len(path_keys) == 0:
811
+ where_query = 'WHERE false'
812
+ else:
813
+ topk_signal = cast(VectorSignal, topk_udf_col.signal_udf)
814
+ # The input is an embedding.
815
+ vector_index = self._get_vector_db_index(topk_signal.embedding, topk_udf_col.path)
816
+ k = (limit or 0) + (offset or 0)
817
+ with DebugTimer(f'Compute topk on "{topk_udf_col.path}" using embedding '
818
+ f'"{topk_signal.embedding}" with vector store "{self.vector_store}"'):
819
+ topk = topk_signal.vector_compute_topk(k, vector_index, path_keys)
820
+ topk_uuids = list(dict.fromkeys([cast(str, uuid) for (uuid, *_), _ in topk]))
821
+ # Update the offset to account for the number of unique UUIDs.
822
+ offset = len(dict.fromkeys([cast(str, uuid) for (uuid, *_), _ in topk[:offset]]))
823
+
824
+ # Ignore all the other filters and filter DuckDB results only by the top k UUIDs.
825
+ uuid_filter = Filter(path=(UUID_COLUMN,), op=ListOp.IN, value=topk_uuids)
826
+ filter_query = self._create_where(manifest, [uuid_filter])[0]
827
+ where_query = f'WHERE {filter_query}'
828
+
829
+ # Map a final column name to a list of temporary namespaced column names that need to be merged.
830
+ columns_to_merge: dict[str, dict[str, Column]] = {}
831
+ temp_column_to_offset_column: dict[str, tuple[str, Field]] = {}
832
+ select_queries: list[str] = []
833
+
834
+ for column in cols:
835
+ path = column.path
836
+ # If the signal is vector-based, we don't need to select the actual data, just the uuids
837
+ # plus an arbitrarily nested array of `None`s.
838
+ empty = bool(column.signal_udf and schema.get_field(path).dtype == DataType.EMBEDDING)
839
+
840
+ select_sqls: list[str] = []
841
+ final_col_name = column.alias or _unique_alias(column)
842
+ if final_col_name not in columns_to_merge:
843
+ columns_to_merge[final_col_name] = {}
844
+
845
+ duckdb_paths = self._column_to_duckdb_paths(column, schema, combine_columns)
846
+ span_from = self._get_span_from(path, manifest) if resolve_span or column.signal_udf else None
847
+
848
+ for parquet_id, duckdb_path in duckdb_paths:
849
+ sql = _select_sql(
850
+ duckdb_path, flatten=False, unnest=False, empty=empty, span_from=span_from)
851
+ temp_column_name = (
852
+ final_col_name if len(duckdb_paths) == 1 else f'{final_col_name}/{parquet_id}')
853
+ select_sqls.append(f'{sql} AS {_escape_string_literal(temp_column_name)}')
854
+ columns_to_merge[final_col_name][temp_column_name] = column
855
+
856
+ if column.signal_udf and span_from and _schema_has_spans(column.signal_udf.fields()):
857
+ sql = _select_sql(duckdb_path, flatten=False, unnest=False, empty=empty, span_from=None)
858
+ temp_offset_column_name = f'{temp_column_name}/offset'
859
+ temp_offset_column_name = temp_offset_column_name.replace("'", "\\'")
860
+ select_sqls.append(f'{sql} AS {_escape_string_literal(temp_offset_column_name)}')
861
+ temp_column_to_offset_column[temp_column_name] = (temp_offset_column_name,
862
+ column.signal_udf.fields())
863
+
864
+ # `select_sqls` can be empty if this column points to a path that will be created by a UDF.
865
+ if select_sqls:
866
+ select_queries.append(', '.join(select_sqls))
867
+
868
+ sort_sql_before_udf: list[str] = []
869
+ sort_sql_after_udf: list[str] = []
870
+
871
+ for path in sort_by:
872
+ # We only allow sorting by nodes with a value.
873
+ sort_path = path
874
+ first_subpath = str(path[0])
875
+ rest_of_path = path[1:]
876
+ signal_alias = '.'.join(map(str, path))
877
+
878
+ udf_path = _path_to_udf_duckdb_path(path, path_to_udf_col_name)
879
+ if not udf_path:
880
+ # Re-route the path if it starts with an alias by pointing it to the actual path.
881
+ if first_subpath in col_aliases:
882
+ path = (*col_aliases[first_subpath], *rest_of_path)
883
+ self._validate_sort_path(path, schema)
884
+ path = self._leaf_path_to_duckdb_path(path, schema)
885
+ else:
886
+ path = udf_path
887
+
888
+ sort_sql = _select_sql(path, flatten=True, unnest=False)
889
+ has_repeated_field = any(subpath == PATH_WILDCARD for subpath in path)
890
+ if has_repeated_field:
891
+ sort_sql = (f'list_min({sort_sql})'
892
+ if sort_order == SortOrder.ASC else f'list_max({sort_sql})')
893
+
894
+ # Separate sort columns into two groups: those that need to be sorted before and after UDFs.
895
+ if udf_path:
896
+ sort_sql_after_udf.append(sort_sql)
897
+ else:
898
+ sort_sql_before_udf.append(sort_sql)
899
+
900
+ order_query = ''
901
+ if sort_sql_before_udf:
902
+ order_query = (f'ORDER BY {", ".join(sort_sql_before_udf)} '
903
+ f'{cast(SortOrder, sort_order).value}')
904
+
905
+ limit_query = ''
906
+ if limit:
907
+ if topk_udf_col:
908
+ limit_query = f'LIMIT {limit + (offset or 0)}'
909
+ elif sort_sql_after_udf:
910
+ limit_query = ''
911
+ else:
912
+ limit_query = f'LIMIT {limit} OFFSET {offset or 0}'
913
+
914
+ if not topk_udf_col and where_query:
915
+ total_num_rows = cast(tuple,
916
+ con.execute(f'SELECT COUNT(*) FROM t {where_query}').fetchone())[0]
917
+
918
+ # Fetch the data from DuckDB.
919
+ df = con.execute(f"""
920
+ SELECT {', '.join(select_queries)} FROM t
921
+ {where_query}
922
+ {order_query}
923
+ {limit_query}
924
+ """).df()
925
+ df = _replace_nan_with_none(df)
926
+
927
+ # Run UDFs on the transformed columns.
928
+ for udf_col in udf_columns:
929
+ signal = cast(Signal, udf_col.signal_udf)
930
+ signal_alias = udf_col.alias or _unique_alias(udf_col)
931
+ temp_signal_cols = columns_to_merge[signal_alias]
932
+ if len(temp_signal_cols) != 1:
933
+ raise ValueError(
934
+ f'Unable to compute signal {signal.name}. Signal UDFs only operate on leafs, but got '
935
+ f'{len(temp_signal_cols)} underlying columns that contain data related to {udf_col.path}.'
936
+ )
937
+ signal_column = list(temp_signal_cols.keys())[0]
938
+ input = df[signal_column]
939
+
940
+ with DebugTimer(f'Computing signal "{signal.name}"'):
941
+ signal.setup()
942
+
943
+ if isinstance(signal, VectorSignal):
944
+ embedding_signal = signal
945
+ vector_store = self._get_vector_db_index(embedding_signal.embedding, udf_col.path)
946
+ flat_keys = list(flatten_keys(df[UUID_COLUMN], input))
947
+ signal_out = sparse_to_dense_compute(
948
+ iter(flat_keys), lambda keys: embedding_signal.vector_compute(keys, vector_store))
949
+ # Add progress.
950
+ if task_step_id is not None:
951
+ signal_out = progress(
952
+ signal_out,
953
+ task_step_id=task_step_id,
954
+ estimated_len=len(flat_keys),
955
+ step_description=f'Computing {signal.key()}')
956
+ df[signal_column] = deep_unflatten(signal_out, input)
957
+ else:
958
+ num_rich_data = count_primitives(input)
959
+ flat_input = cast(Iterator[Optional[RichData]], deep_flatten(input))
960
+ signal_out = sparse_to_dense_compute(
961
+ flat_input, lambda x: signal.compute(cast(Iterable[RichData], x)))
962
+ # Add progress.
963
+ if task_step_id is not None:
964
+ signal_out = progress(
965
+ signal_out,
966
+ task_step_id=task_step_id,
967
+ estimated_len=num_rich_data,
968
+ step_description=f'Computing {signal.key()}')
969
+ signal_out_list = list(signal_out)
970
+ if signal_column in temp_column_to_offset_column:
971
+ offset_column_name, field = temp_column_to_offset_column[signal_column]
972
+ nested_spans: Iterable[Item] = df[offset_column_name]
973
+ flat_spans = deep_flatten(nested_spans)
974
+ for span, item in zip(flat_spans, signal_out_list):
975
+ _offset_any_span(cast(int, span[VALUE_KEY][TEXT_SPAN_START_FEATURE]), item, field)
976
+
977
+ if len(signal_out_list) != num_rich_data:
978
+ raise ValueError(
979
+ f'The signal generated {len(signal_out_list)} values but the input data had '
980
+ f"{num_rich_data} values. This means the signal either didn't generate a "
981
+ '"None" for a sparse output, or generated too many items.')
982
+
983
+ df[signal_column] = deep_unflatten(signal_out_list, input)
984
+
985
+ signal.teardown()
986
+ if not df.empty and (udf_filters or sort_sql_after_udf):
987
+ # Re-upload the udf outputs to duckdb so we can filter/sort on them.
988
+ rel = con.from_df(df)
989
+
990
+ if udf_filters:
991
+ udf_filter_queries = self._create_where(manifest, udf_filters)
992
+ if udf_filter_queries:
993
+ rel = rel.filter(' AND '.join(udf_filter_queries))
994
+ total_num_rows = cast(tuple, rel.count('*').fetchone())[0]
995
+
996
+ if sort_sql_after_udf:
997
+ if not sort_order:
998
+ raise ValueError('`sort_order` is required when `sort_by` is specified.')
999
+ rel = rel.order(f'{", ".join(sort_sql_after_udf)} {sort_order.value}')
1000
+
1001
+ if limit:
1002
+ rel = rel.limit(limit, offset or 0)
1003
+
1004
+ df = _replace_nan_with_none(rel.df())
1005
+
1006
+ if combine_columns:
1007
+ all_columns: dict[str, Column] = {}
1008
+ for col_dict in columns_to_merge.values():
1009
+ all_columns.update(col_dict)
1010
+ columns_to_merge = {'*': all_columns}
1011
+
1012
+ for offset_column, _ in temp_column_to_offset_column.values():
1013
+ del df[offset_column]
1014
+
1015
+ for final_col_name, temp_columns in columns_to_merge.items():
1016
+ for temp_col_name, column in temp_columns.items():
1017
+ if combine_columns:
1018
+ dest_path = _col_destination_path(column)
1019
+ spec = _split_path_into_subpaths_of_lists(dest_path)
1020
+ df[temp_col_name] = wrap_in_dicts(df[temp_col_name], spec)
1021
+
1022
+ # If the temp col name is the same as the final name, we can skip merging. This happens when
1023
+ # we select a source leaf column.
1024
+ if temp_col_name == final_col_name:
1025
+ continue
1026
+
1027
+ if final_col_name not in df:
1028
+ df[final_col_name] = df[temp_col_name]
1029
+ else:
1030
+ df[final_col_name] = merge_series(df[final_col_name], df[temp_col_name])
1031
+ del df[temp_col_name]
1032
+
1033
+ con.close()
1034
+
1035
+ if combine_columns:
1036
+ # Since we aliased every column to `*`, the object will have only '*' as the key. We need to
1037
+ # elevate all the columns under '*'.
1038
+ df = pd.DataFrame.from_records(df['*'])
1039
+
1040
+ return SelectRowsResult(df, total_num_rows)
1041
+
1042
+ @override
1043
+ def select_rows_schema(self,
1044
+ columns: Optional[Sequence[ColumnId]] = None,
1045
+ sort_by: Optional[Sequence[Path]] = None,
1046
+ sort_order: Optional[SortOrder] = None,
1047
+ searches: Optional[Sequence[Search]] = None,
1048
+ combine_columns: bool = False) -> SelectRowsSchemaResult:
1049
+ """Returns the schema of the result of `select_rows` above with the same arguments."""
1050
+ if not combine_columns:
1051
+ raise NotImplementedError(
1052
+ 'select_rows_schema with combine_columns=False is not yet supported.')
1053
+ manifest = self.manifest()
1054
+ cols = self._normalize_columns(columns, manifest.data_schema)
1055
+
1056
+ # Always return the UUID column.
1057
+ col_paths = [col.path for col in cols]
1058
+ if (UUID_COLUMN,) not in col_paths:
1059
+ cols.append(column_from_identifier(UUID_COLUMN))
1060
+
1061
+ self._normalize_searches(searches, manifest)
1062
+ search_udfs = self._search_udfs(searches, manifest)
1063
+ cols.extend([search_udf.udf for search_udf in search_udfs])
1064
+
1065
+ udfs: list[SelectRowsSchemaUDF] = []
1066
+ col_schemas: list[Schema] = []
1067
+ for col in cols:
1068
+ dest_path = _col_destination_path(col)
1069
+ if col.signal_udf:
1070
+ udfs.append(SelectRowsSchemaUDF(path=dest_path, alias=col.alias))
1071
+ field = col.signal_udf.fields()
1072
+ field.signal = col.signal_udf.dict()
1073
+ elif manifest.data_schema.has_field(dest_path):
1074
+ field = manifest.data_schema.get_field(dest_path)
1075
+ else:
1076
+ # This column might refer to an output of a udf. We postpone validation to later.
1077
+ continue
1078
+ col_schemas.append(_make_schema_from_path(dest_path, field))
1079
+
1080
+ sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)
1081
+
1082
+ search_results = [
1083
+ SearchResultInfo(search_path=search_udf.search_path, result_path=search_udf.output_path)
1084
+ for search_udf in search_udfs
1085
+ ]
1086
+
1087
+ new_schema = merge_schemas(col_schemas)
1088
+
1089
+ # Now that we have the new schema, we can validate all the column selections.
1090
+ self._validate_columns(cols, manifest.data_schema, new_schema)
1091
+
1092
+ return SelectRowsSchemaResult(
1093
+ data_schema=new_schema, udfs=udfs, search_results=search_results, sorts=sort_results or None)
1094
+
1095
+ @override
1096
+ def media(self, item_id: str, leaf_path: Path) -> MediaResult:
1097
+ raise NotImplementedError('Media is not yet supported for the DuckDB implementation.')
1098
+
1099
+ def _get_span_from(self, path: PathTuple, manifest: DatasetManifest) -> Optional[PathTuple]:
1100
+ leafs = manifest.data_schema.leafs
1101
+ # Remove the value key so we can check the dtype from leafs.
1102
+ span_path = path[:-1] if path[-1] == VALUE_KEY else path
1103
+ is_span = (span_path in leafs and leafs[span_path].dtype == DataType.STRING_SPAN)
1104
+ return _derived_from_path(path, manifest.data_schema) if is_span else None
1105
+
1106
+ def _leaf_path_to_duckdb_path(self, leaf_path: PathTuple, schema: Schema) -> PathTuple:
1107
+ ((_, duckdb_path),) = self._column_to_duckdb_paths(
1108
+ Column(leaf_path), schema, combine_columns=False, select_leaf=True)
1109
+ return duckdb_path
1110
+
1111
+ def _column_to_duckdb_paths(self,
1112
+ column: Column,
1113
+ schema: Schema,
1114
+ combine_columns: bool,
1115
+ select_leaf: bool = False) -> list[tuple[str, PathTuple]]:
1116
+ path = column.path
1117
+ parquet_manifests: list[Union[SourceManifest, SignalManifest]] = [
1118
+ self._source_manifest, *self._signal_manifests
1119
+ ]
1120
+ duckdb_paths: list[tuple[str, PathTuple]] = []
1121
+ source_has_path = False
1122
+
1123
+ select_leaf = select_leaf or column.signal_udf is not None
1124
+
1125
+ for m in parquet_manifests:
1126
+ if not m.files:
1127
+ continue
1128
+ # Skip this parquet file if it doesn't contain the path.
1129
+ if not schema_contains_path(m.data_schema, path):
1130
+ continue
1131
+
1132
+ if isinstance(m, SourceManifest):
1133
+ source_has_path = True
1134
+
1135
+ if isinstance(m, SignalManifest) and source_has_path and not combine_columns:
1136
+ # Skip this signal if the source already has the path and we are not combining columns.
1137
+ continue
1138
+
1139
+ # Skip this parquet file if the path doesn't have a dtype.
1140
+ if select_leaf and not m.data_schema.get_field(path).dtype:
1141
+ continue
1142
+
1143
+ if isinstance(m, SignalManifest) and path == (UUID_COLUMN,):
1144
+ # Do not select UUID from the signal because it's already in the source.
1145
+ continue
1146
+
1147
+ duckdb_path = path
1148
+ parquet_id = 'source'
1149
+
1150
+ if isinstance(m, SignalManifest):
1151
+ duckdb_path = (m.parquet_id, *path[1:])
1152
+ parquet_id = m.parquet_id
1153
+
1154
+ duckdb_paths.append((parquet_id, duckdb_path))
1155
+
1156
+ if not duckdb_paths:
1157
+ # This path is probably a result of a udf. Make sure the result schema contains it.
1158
+ if not schema.has_field(path):
1159
+ raise ValueError(f'Invalid path "{path}": No manifest contains path. Valid paths: '
1160
+ f'{list(schema.leafs.keys())}')
1161
+
1162
+ return duckdb_paths
1163
+
1164
+ def _normalize_filters(self, filter_likes: Optional[Sequence[FilterLike]],
1165
+ col_aliases: dict[str, PathTuple], udf_aliases: dict[str, PathTuple],
1166
+ manifest: DatasetManifest) -> tuple[list[Filter], list[Filter]]:
1167
+ """Normalize `FilterLike` to `Filter` and split into filters on source and filters on UDFs."""
1168
+ filter_likes = filter_likes or []
1169
+ filters: list[Filter] = []
1170
+ udf_filters: list[Filter] = []
1171
+
1172
+ for filter in filter_likes:
1173
+ # Normalize `FilterLike` to `Filter`.
1174
+ if not isinstance(filter, Filter):
1175
+ if len(filter) == 3:
1176
+ path, op, value = filter # type: ignore
1177
+ elif len(filter) == 2:
1178
+ path, op = filter # type: ignore
1179
+ value = None
1180
+ else:
1181
+ raise ValueError(f'Invalid filter: {filter}. Must be a tuple with 2 or 3 elements.')
1182
+ filter = Filter(path=normalize_path(path), op=op, value=value)
1183
+
1184
+ if str(filter.path[0]) in udf_aliases:
1185
+ udf_filters.append(filter)
1186
+ else:
1187
+ filters.append(filter)
1188
+
1189
+ self._validate_filters(filters, col_aliases, manifest)
1190
+ return filters, udf_filters
1191
+
1192
+ def _normalize_searches(self, searches: Optional[Sequence[Search]],
1193
+ manifest: DatasetManifest) -> None:
1194
+ """Validate searches."""
1195
+ if not searches:
1196
+ return
1197
+
1198
+ for search in searches:
1199
+ search.path = normalize_path(search.path)
1200
+ field = manifest.data_schema.get_field(search.path)
1201
+ if field.dtype != DataType.STRING:
1202
+ raise ValueError(f'Invalid search path: {search.path}. '
1203
+ f'Must be a string field, got dtype {field.dtype}')
1204
+
1205
+ def _search_udfs(self, searches: Optional[Sequence[Search]],
1206
+ manifest: DatasetManifest) -> list[DuckDBSearchUDF]:
1207
+ """Create a UDF for each search for finding the location of the text with spans."""
1208
+ searches = searches or []
1209
+ search_udfs: list[DuckDBSearchUDF] = []
1210
+ for search in searches:
1211
+ search_path = normalize_path(search.path)
1212
+ if search.query.type == 'keyword':
1213
+ udf = Column(path=search_path, signal_udf=SubstringSignal(query=search.query.search))
1214
+ search_udfs.append(
1215
+ DuckDBSearchUDF(
1216
+ udf=udf,
1217
+ search_path=search_path,
1218
+ output_path=(*_col_destination_path(udf), PATH_WILDCARD)))
1219
+ elif search.query.type == 'semantic' or search.query.type == 'concept':
1220
+ embedding = search.query.embedding
1221
+ if not embedding:
1222
+ raise ValueError(f'Please provide an embedding for {search.query.type} search. Got search: {search}')
1223
+
1224
+ try:
1225
+ manifest.data_schema.get_field((*search_path, embedding))
1226
+ except Exception as e:
1227
+ raise ValueError(
1228
+ f'Embedding {embedding} has not been computed. '
1229
+ f'Please compute the embedding index before issuing a {search.query.type} query.'
1230
+ ) from e
1231
+
1232
+ search_signal: Optional[Signal] = None
1233
+ if search.query.type == 'semantic':
1234
+ search_signal = SemanticSimilaritySignal(
1235
+ query=search.query.search, embedding=search.query.embedding)
1236
+ elif search.query.type == 'concept':
1237
+ search_signal = ConceptScoreSignal(
1238
+ namespace=search.query.concept_namespace,
1239
+ concept_name=search.query.concept_name,
1240
+ embedding=search.query.embedding)
1241
+
1242
+ # Add the label UDF.
1243
+ concept_labels_signal = ConceptLabelsSignal(
1244
+ namespace=search.query.concept_namespace, concept_name=search.query.concept_name)
1245
+ concept_labels_udf = Column(path=search_path, signal_udf=concept_labels_signal)
1246
+ search_udfs.append(
1247
+ DuckDBSearchUDF(
1248
+ udf=concept_labels_udf,
1249
+ search_path=search_path,
1250
+ output_path=_col_destination_path(concept_labels_udf),
1251
+ sort=None))
1252
+
1253
+ udf = Column(path=search_path, signal_udf=search_signal)
1254
+
1255
+ output_path = _col_destination_path(udf)
1256
+ search_udfs.append(
1257
+ DuckDBSearchUDF(
1258
+ udf=udf,
1259
+ search_path=search_path,
1260
+ output_path=_col_destination_path(udf),
1261
+ sort=((*output_path, PATH_WILDCARD, 'score'), SortOrder.DESC)))
1262
+ else:
1263
+ raise ValueError(f'Unknown search operator {search.query.type}.')
1264
+
1265
+ return search_udfs
1266
+
1267
+ def _create_where(self,
1268
+ manifest: DatasetManifest,
1269
+ filters: list[Filter],
1270
+ searches: Optional[Sequence[Search]] = []) -> list[str]:
1271
+ if not filters and not searches:
1272
+ return []
1273
+ searches = searches or []
1274
+ sql_filter_queries: list[str] = []
1275
+
1276
+ # Add search where queries.
1277
+ for search in searches:
1278
+ duckdb_path = self._leaf_path_to_duckdb_path(
1279
+ normalize_path(search.path), manifest.data_schema)
1280
+ select_str = _select_sql(duckdb_path, flatten=False, unnest=False)
1281
+ if search.query.type == 'keyword':
1282
+ sql_op = 'ILIKE'
1283
+ query_val = _escape_like_value(search.query.search)
1284
+ elif search.query.type == 'semantic' or search.query.type == 'concept':
1285
+ # Semantic search and concepts don't yet filter.
1286
+ continue
1287
+ else:
1288
+ raise ValueError(f'Unknown search operator {search.query.type}.')
1289
+
1290
+ filter_query = f'{select_str} {sql_op} {query_val}'
1291
+
1292
+ sql_filter_queries.append(filter_query)
1293
+
1294
+ # Add filter where queries.
1295
+ binary_ops = set(BinaryOp)
1296
+ unary_ops = set(UnaryOp)
1297
+ list_ops = set(ListOp)
1298
+ for f in filters:
1299
+ duckdb_path = self._leaf_path_to_duckdb_path(f.path, manifest.data_schema)
1300
+ select_str = _select_sql(
1301
+ duckdb_path, flatten=True, unnest=False, span_from=self._get_span_from(f.path, manifest))
1302
+ is_array = any(subpath == PATH_WILDCARD for subpath in f.path)
1303
+
1304
+ nan_filter = ''
1305
+ field = manifest.data_schema.get_field(f.path)
1306
+ filter_nans = field.dtype and is_float(field.dtype)
1307
+
1308
+ if f.op in binary_ops:
1309
+ sql_op = BINARY_OP_TO_SQL[cast(BinaryOp, f.op)]
1310
+ filter_val = cast(FeatureValue, f.value)
1311
+ if isinstance(filter_val, str):
1312
+ filter_val = _escape_string_literal(filter_val)
1313
+ elif isinstance(filter_val, bytes):
1314
+ filter_val = _bytes_to_blob_literal(filter_val)
1315
+ else:
1316
+ filter_val = str(filter_val)
1317
+ if is_array:
1318
+ nan_filter = 'NOT isnan(x) AND' if filter_nans else ''
1319
+ filter_query = (f'len(list_filter({select_str}, '
1320
+ f'x -> {nan_filter} x {sql_op} {filter_val})) > 0')
1321
+ else:
1322
+ nan_filter = f'NOT isnan({select_str}) AND' if filter_nans else ''
1323
+ filter_query = f'{nan_filter} {select_str} {sql_op} {filter_val}'
1324
+ elif f.op in unary_ops:
1325
+ if f.op == UnaryOp.EXISTS:
1326
+ filter_query = f'len({select_str}) > 0' if is_array else f'{select_str} IS NOT NULL'
1327
+ else:
1328
+ raise ValueError(f'Unary op: {f.op} is not yet supported')
1329
+ elif f.op in list_ops:
1330
+ if f.op == ListOp.IN:
1331
+ filter_list_val = cast(FeatureListValue, f.value)
1332
+ if not isinstance(filter_list_val, list):
1333
+ raise ValueError('filter with array value can only use the IN comparison')
1334
+ wrapped_filter_val = [f"'{part}'" for part in filter_list_val]
1335
+ filter_val = f'({", ".join(wrapped_filter_val)})'
1336
+ filter_query = f'{select_str} IN {filter_val}'
1337
+ else:
1338
+ raise ValueError(f'List op: {f.op} is not yet supported')
1339
+ else:
1340
+ raise ValueError(f'Invalid filter op: {f.op}')
1341
+ sql_filter_queries.append(filter_query)
1342
+ return sql_filter_queries
1343
+
1344
+ def _execute(self, query: str) -> duckdb.DuckDBPyConnection:
1345
+ """Execute a query in duckdb."""
1346
+ # FastAPI is multi-threaded so we have to create a thread-specific connection cursor to allow
1347
+ # these queries to be thread-safe.
1348
+ local_con = self.con.cursor()
1349
+ if not env('DEBUG', False):
1350
+ return local_con.execute(query)
1351
+
1352
+ # Debug mode.
1353
+ log('Executing:')
1354
+ log(query)
1355
+ with DebugTimer('Query'):
1356
+ return local_con.execute(query)
1357
+
1358
+ def _query(self, query: str) -> list[tuple]:
1359
+ result = self._execute(query)
1360
+ rows = result.fetchall()
1361
+ result.close()
1362
+ return rows
1363
+
1364
+ def _query_df(self, query: str) -> pd.DataFrame:
1365
+ """Execute a query that returns a data frame."""
1366
+ result = self._execute(query)
1367
+ df = _replace_nan_with_none(result.df())
1368
+ result.close()
1369
+ return df
1370
+
1371
+ def _path_to_col(self, path: Path, quote_each_part: bool = True) -> str:
1372
+ """Convert a path to a column name."""
1373
+ if isinstance(path, str):
1374
+ path = (path,)
1375
+ return '.'.join([
1376
+ f'{_escape_col_name(path_comp)}' if quote_each_part else str(path_comp) for path_comp in path
1377
+ ])
1378
+
1379
+ @override
1380
+ def to_json(self, filepath: Union[str, pathlib.Path], jsonl: bool = True) -> None:
1381
+ self._execute(f"COPY t TO '{filepath}' (FORMAT JSON, ARRAY {'FALSE' if jsonl else 'TRUE'})")
1382
+ log(f'Dataset exported to {filepath}')
1383
+
1384
+ @override
1385
+ def to_pandas(self) -> pd.DataFrame:
1386
+ return self._query_df('SELECT * FROM t')
1387
+
1388
+ @override
1389
+ def to_csv(self, filepath: Union[str, pathlib.Path]) -> None:
1390
+ self._execute(f"COPY t TO '{filepath}' (FORMAT CSV, HEADER)")
1391
+ log(f'Dataset exported to {filepath}')
1392
+
1393
+ @override
1394
+ def to_parquet(self, filepath: Union[str, pathlib.Path]) -> None:
1395
+ self._execute(f"COPY t TO '{filepath}' (FORMAT PARQUET)")
1396
+ log(f'Dataset exported to {filepath}')
1397
+
1398
+
1399
+ def _escape_string_literal(string: str) -> str:
1400
+ string = string.replace("'", "''")
1401
+ return f"'{string}'"
1402
+
1403
+
1404
+ def _escape_col_name(col_name: str) -> str:
1405
+ col_name = col_name.replace('"', '""')
1406
+ return f'"{col_name}"'
1407
+
1408
+
1409
+ def _escape_like_value(value: str) -> str:
1410
+ value = value.replace('%', '\\%').replace('_', '\\_')
1411
+ return f"'%{value}%' ESCAPE '\\'"
1412
+
1413
+
1414
+ def _inner_select(sub_paths: list[PathTuple],
1415
+ inner_var: Optional[str] = None,
1416
+ empty: bool = False,
1417
+ span_from: Optional[PathTuple] = None) -> str:
1418
+ """Recursively generate the inner select statement for a list of sub paths."""
1419
+ current_sub_path = sub_paths[0]
1420
+ lambda_var = inner_var + 'x' if inner_var else 'x'
1421
+ if not inner_var:
1422
+ lambda_var = 'x'
1423
+ inner_var = _escape_col_name(current_sub_path[0])
1424
+ current_sub_path = current_sub_path[1:]
1425
+ # Select the path inside structs. E.g. x['a']['b']['c'] given current_sub_path = [a, b, c].
1426
+ path_key = inner_var + ''.join([f'[{_escape_string_literal(p)}]' for p in current_sub_path])
1427
+ if len(sub_paths) == 1:
1428
+ if span_from:
1429
+ derived_col = _select_sql(span_from, flatten=False, unnest=False)
1430
+ path_key = (f'{derived_col}[{path_key}.{VALUE_KEY}.{TEXT_SPAN_START_FEATURE}+1:'
1431
+ f'{path_key}.{VALUE_KEY}.{TEXT_SPAN_END_FEATURE}]')
1432
+ return 'NULL' if empty else path_key
1433
+ return (f'list_transform({path_key}, {lambda_var} -> '
1434
+ f'{_inner_select(sub_paths[1:], lambda_var, empty, span_from)})')
1435
+
1436
+
1437
+ def _split_path_into_subpaths_of_lists(leaf_path: PathTuple) -> list[PathTuple]:
1438
+ """Split a path into a subpath of lists.
1439
+
1440
+ E.g. [a, b, c, *, d, *, *] gets split into [[a, b, c], [d], [], []].
1441
+ """
1442
+ sub_paths: list[PathTuple] = []
1443
+ offset = 0
1444
+ while offset <= len(leaf_path):
1445
+ new_offset = leaf_path.index(PATH_WILDCARD,
1446
+ offset) if PATH_WILDCARD in leaf_path[offset:] else len(leaf_path)
1447
+ sub_path = leaf_path[offset:new_offset]
1448
+ sub_paths.append(sub_path)
1449
+ offset = new_offset + 1
1450
+ return sub_paths
1451
+
1452
+
1453
+ def _select_sql(path: PathTuple,
1454
+ flatten: bool,
1455
+ unnest: bool,
1456
+ empty: bool = False,
1457
+ span_from: Optional[PathTuple] = None) -> str:
1458
+ """Create a select column for a path.
1459
+
1460
+ Args:
1461
+ path: A path to a feature. E.g. ['a', 'b', 'c'].
1462
+ flatten: Whether to flatten the result.
1463
+ unnest: Whether to unnest the result.
1464
+ empty: Whether to return an empty list (used for embedding signals that don't need the data).
1465
+ span_from: The path this span is derived from. If specified, the span will be resolved
1466
+ to a substring of the original string.
1467
+ """
1468
+ sub_paths = _split_path_into_subpaths_of_lists(path)
1469
+ selection = _inner_select(sub_paths, None, empty, span_from)
1470
+ # We only flatten when the result is a nested list to avoid a segfault.
1471
+ is_result_nested_list = len(sub_paths) >= 3 # E.g. sub_paths = [[a, b, c], *, *].
1472
+ if flatten and is_result_nested_list:
1473
+ selection = f'flatten({selection})'
1474
+ # We only unnest when the result is a list. E.g. sub_paths = [[a, b, c], *].
1475
+ is_result_a_list = len(sub_paths) >= 2
1476
+ if unnest and is_result_a_list:
1477
+ selection = f'unnest({selection})'
1478
+ return selection
1479
+
1480
+
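As a quick illustration of the helpers above, here is a hedged sketch of the SQL fragment `_select_sql` would build for a path with one wildcard; the expected string is read off the code, not captured from a real run.

# Hypothetical path ('a', '*', 'b'): one list of structs with a 'b' field.
# _split_path_into_subpaths_of_lists(('a', '*', 'b')) -> [('a',), ('b',)]
sql = _select_sql(('a', '*', 'b'), flatten=True, unnest=True)
# Expected, by tracing _inner_select: unnest(list_transform("a", x -> x['b']))
# (no flatten() wrapper, since the result is only a single level of list)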
1481
+ def read_source_manifest(dataset_path: str) -> SourceManifest:
1482
+ """Read the manifest file."""
1483
+ with open_file(os.path.join(dataset_path, MANIFEST_FILENAME), 'r') as f:
1484
+ return SourceManifest.parse_raw(f.read())
1485
+
1486
+
1487
+ def _signal_dir(enriched_path: PathTuple) -> str:
1488
+ """Get the filename prefix for a signal parquet file."""
1489
+ path_without_wildcards = (p for p in enriched_path if p != PATH_WILDCARD)
1490
+ return os.path.join(*path_without_wildcards)
1491
+
1492
+
1493
+ def split_column_name(column: str, split_name: str) -> str:
1494
+ """Get the name of a split column."""
1495
+ return f'{column}.{split_name}'
1496
+
1497
+
1498
+ def split_parquet_prefix(column_name: str, splitter_name: str) -> str:
1499
+ """Get the filename prefix for a split parquet file."""
1500
+ return f'{column_name}.{splitter_name}'
1501
+
1502
+
1503
+ def _bytes_to_blob_literal(bytes: bytes) -> str:
1504
+ """Convert bytes to a blob literal."""
1505
+ escaped_hex = re.sub(r'(.{2})', r'\\x\1', bytes.hex())
1506
+ return f"'{escaped_hex}'::BLOB"
1507
+
1508
+
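A small sketch of the blob escaping above; the expected output is derived by reading the regex, not from a captured run.

_bytes_to_blob_literal(b'\x01\xff')
# hex() gives '01ff'; each byte pair is prefixed with a backslash-x, yielding
# '\x01\xff'::BLOB with literal backslashes, i.e. the Python string "'\\x01\\xff'::BLOB".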
1509
+ class SignalManifest(BaseModel):
1510
+ """The manifest that describes a signal computation including schema and parquet files."""
1511
+ # List of a parquet filepaths storing the data. The paths are relative to the manifest.
1512
+ files: list[str]
1513
+
1514
+ # An identifier for this parquet table. Will be used as the view name in SQL.
1515
+ parquet_id: str
1516
+
1517
+ data_schema: Schema
1518
+ signal: Signal
1519
+
1520
+ # The column path that this signal is derived from.
1521
+ enriched_path: PathTuple
1522
+
1523
+ # The name of the vector store. Present when the signal is an embedding.
1524
+ vector_store: Optional[str] = None
1525
+
1526
+ @validator('signal', pre=True)
1527
+ def parse_signal(cls, signal: dict) -> Signal:
1528
+ """Parse a signal to its specific subclass instance."""
1529
+ return resolve_signal(signal)
1530
+
1531
+
1532
+ def _merge_cells(dest_cell: Item, source_cell: Item) -> Item:
1533
+ if source_cell is None or isinstance(source_cell, float) and math.isnan(source_cell):
1534
+ # Nothing to merge here (missing value).
1535
+ return dest_cell
1536
+ if isinstance(dest_cell, dict):
1537
+ if isinstance(source_cell, list):
1538
+ raise ValueError(f'Failed to merge cells. Destination is a dict ({dest_cell!r}), '
1539
+ f'but source is a list ({source_cell!r}).')
1540
+ if isinstance(source_cell, dict):
1541
+ res = {**dest_cell}
1542
+ for key, value in source_cell.items():
1543
+ res[key] = (value if key not in dest_cell else _merge_cells(dest_cell[key], value))
1544
+ return res
1545
+ else:
1546
+ return {VALUE_KEY: source_cell, **dest_cell}
1547
+ elif isinstance(dest_cell, list):
1548
+ if not isinstance(source_cell, list):
1549
+ raise ValueError('Failed to merge cells. Destination is a list, but source is not.')
1550
+ return [
1551
+ _merge_cells(dest_subcell, source_subcell)
1552
+ for dest_subcell, source_subcell in zip(dest_cell, source_cell)
1553
+ ]
1554
+ else:
1555
+ # The destination is a primitive.
1556
+ if isinstance(source_cell, list):
1557
+ raise ValueError(f'Failed to merge cells. Destination is a primitive ({dest_cell!r}), '
1558
+ f'but source is a list ({source_cell!r}).')
1559
+ if isinstance(source_cell, dict):
1560
+ return {VALUE_KEY: dest_cell, **source_cell}
1561
+ else:
1562
+ # Primitives can be merged together if they are equal. This can happen if a user selects a
1563
+ # column that is the child of another.
1564
+ # NOTE: This can be removed if we fix https://github.com/lilacai/lilac/issues/166.
1565
+ if source_cell != dest_cell:
1566
+ raise ValueError(f'Cannot merge source "{source_cell!r}" into destination "{dest_cell!r}"')
1567
+ return dest_cell
1568
+
1569
+
1570
+ def merge_series(destination: pd.Series, source: pd.Series) -> list[Item]:
1571
+ """Merge two series of values recursively."""
1572
+ return _merge_cells(destination.tolist(), source.tolist())
1573
+
1574
+
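To make the merge rules above concrete, a minimal sketch with made-up field names; the expected value follows from the dict/primitive branches of `_merge_cells`.

import pandas as pd

dest = pd.Series([{'text': 'hello'}])       # cell coming from the source parquet
source = pd.Series([{'text': {'len': 5}}])  # cell coming from a (hypothetical) signal parquet
merge_series(dest, source)
# Expected: [{'text': {VALUE_KEY: 'hello', 'len': 5}}]
# The primitive 'hello' is lifted under VALUE_KEY so the signal metadata can sit next to it.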
1575
+ def _unique_alias(column: Column) -> str:
1576
+ """Get a unique alias for a selection column."""
1577
+ if column.signal_udf:
1578
+ return make_parquet_id(column.signal_udf, column.path)
1579
+ return '.'.join(map(str, column.path))
1580
+
1581
+
1582
+ def _path_contains(parent_path: PathTuple, child_path: PathTuple) -> bool:
1583
+ """Check if a path contains another path."""
1584
+ if len(parent_path) > len(child_path):
1585
+ return False
1586
+ return all(parent_path[i] == child_path[i] for i in range(len(parent_path)))
1587
+
1588
+
1589
+ def _path_to_udf_duckdb_path(path: PathTuple,
1590
+ path_to_udf_col_name: dict[PathTuple, str]) -> Optional[PathTuple]:
1591
+ first_subpath, *rest_of_path = path
1592
+ for parent_path, udf_col_name in path_to_udf_col_name.items():
1593
+ # If the user selected udf(document.*.text) as "udf" and wanted to sort by "udf.len", we need to
1594
+ # sort by "udf.*.len" where the "*" came from the fact that the udf was applied to a list of
1595
+ # "text" fields.
1596
+ wildcards = [x for x in parent_path if x == PATH_WILDCARD]
1597
+ if _path_contains(parent_path, path):
1598
+ return (udf_col_name, *wildcards, *path[len(parent_path):])
1599
+ elif first_subpath == udf_col_name:
1600
+ return (udf_col_name, *wildcards, *rest_of_path)
1601
+
1602
+ return None
1603
+
1604
+
1605
+ def _col_destination_path(column: Column, is_computed_signal: Optional[bool] = False) -> PathTuple:
1606
+ """Get the destination path where the output of this selection column will be stored."""
1607
+ source_path = column.path
1608
+
1609
+ if not column.signal_udf:
1610
+ return source_path
1611
+
1612
+ signal_key = column.signal_udf.key(is_computed_signal=is_computed_signal)
1613
+ # If we are enriching a value we should store the signal data in the value's parent.
1614
+ if source_path[-1] == VALUE_KEY:
1615
+ dest_path = (*source_path[:-1], signal_key)
1616
+ else:
1617
+ dest_path = (*source_path, signal_key)
1618
+
1619
+ return dest_path
1620
+
1621
+
1622
+ def _root_column(manifest: SignalManifest) -> str:
1623
+ """Returns the root column of a signal manifest."""
1624
+ field_keys = manifest.data_schema.fields.keys()
1625
+ if len(field_keys) != 2:
1626
+ raise ValueError('Expected exactly two fields in signal manifest, '
1627
+ f'the row UUID and the root field this signal is enriching. Got {field_keys}.')
1628
+ return next(filter(lambda field: field != UUID_COLUMN, manifest.data_schema.fields.keys()))
1629
+
1630
+
1631
+ def _derived_from_path(path: PathTuple, schema: Schema) -> PathTuple:
1632
+ # Find the closest parent of `path` that is a signal root.
1633
+ for i in reversed(range(len(path))):
1634
+ sub_path = path[:i]
1635
+ if schema.get_field(sub_path).signal is not None:
1636
+ # Skip the signal name at the end to get the source path that was enriched.
1637
+ return sub_path[:-1]
1638
+ raise ValueError(f'Cannot find the source path for the enriched path: {path}')
1639
+
1640
+
1641
+ def _make_schema_from_path(path: PathTuple, field: Field) -> Schema:
1642
+ """Returns a schema that contains only the given path."""
1643
+ for sub_path in reversed(path):
1644
+ if sub_path == PATH_WILDCARD:
1645
+ field = Field(repeated_field=field)
1646
+ else:
1647
+ field = Field(fields={sub_path: field})
1648
+ if not field.fields:
1649
+ raise ValueError(f'Invalid path: {path}. Must contain at least one field name.')
1650
+ return Schema(fields=field.fields)
1651
+
1652
+
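A brief sketch of how the wildcard handling above unrolls for a repeated text field (the field names are illustrative):

_make_schema_from_path(('doc', PATH_WILDCARD, 'text'), Field(dtype=DataType.STRING))
# Roughly equivalent to:
# Schema(fields={'doc': Field(repeated_field=Field(fields={'text': Field(dtype=DataType.STRING)}))})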
1653
+ def _replace_nan_with_none(df: pd.DataFrame) -> pd.DataFrame:
1654
+ """DuckDB returns np.nan for missing field in string column, replace with None for correctness."""
1655
+ # TODO(https://github.com/duckdb/duckdb/issues/4066): Remove this once duckdb fixes upstream.
1656
+ for col in df.columns:
1657
+ if is_object_dtype(df[col]):
1658
+ df[col].replace(np.nan, None, inplace=True)
1659
+ return df
1660
+
1661
+
1662
+ def _offset_any_span(offset: int, item: Item, schema: Field) -> None:
1663
+ """Offsets any spans inplace by the given parent offset."""
1664
+ if schema.dtype == DataType.STRING_SPAN:
1665
+ item = cast(dict, item)
1666
+ item[VALUE_KEY][TEXT_SPAN_START_FEATURE] += offset
1667
+ item[VALUE_KEY][TEXT_SPAN_END_FEATURE] += offset
1668
+ if schema.fields:
1669
+ item = cast(dict, item)
1670
+ for key, sub_schema in schema.fields.items():
1671
+ _offset_any_span(offset, item[key], sub_schema)
1672
+ if schema.repeated_field:
1673
+ item = cast(list, item)
1674
+ for sub_item in item:
1675
+ _offset_any_span(offset, sub_item, schema.repeated_field)
1676
+
1677
+
1678
+ def _schema_has_spans(field: Field) -> bool:
1679
+ if field.dtype and field.dtype == DataType.STRING_SPAN:
1680
+ return True
1681
+ if field.fields:
1682
+ children_have_spans = any(_schema_has_spans(sub_field) for sub_field in field.fields.values())
1683
+ if children_have_spans:
1684
+ return True
1685
+ if field.repeated_field:
1686
+ return _schema_has_spans(field.repeated_field)
1687
+ return False
1688
+
1689
+
1690
+ def _normalize_bins(bins: Optional[Union[Sequence[Bin], Sequence[float]]]) -> Optional[list[Bin]]:
1691
+ if bins is None:
1692
+ return None
1693
+ if not isinstance(bins[0], (float, int)):
1694
+ return cast(list[Bin], bins)
1695
+ named_bins: list[Bin] = []
1696
+ for i in range(len(bins) + 1):
1697
+ start = cast(float, bins[i - 1]) if i > 0 else None
1698
+ end = cast(float, bins[i]) if i < len(bins) else None
1699
+ named_bins.append((str(i), start, end))
1700
+ return named_bins
1701
+
1702
+
1703
+ def _auto_bins(stats: StatsResult, num_bins: int) -> list[Bin]:
1704
+ min_val = cast(float, stats.min_val)
1705
+ max_val = cast(float, stats.max_val)
1706
+ bin_width = (max_val - min_val) / num_bins
1707
+ bins: list[Bin] = []
1708
+ for i in range(num_bins):
1709
+ start = None if i == 0 else min_val + i * bin_width
1710
+ end = None if i == num_bins - 1 else min_val + (i + 1) * bin_width
1711
+ bins.append((str(i), start, end))
1712
+ return bins
1713
+
1714
+
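A short sketch of the two bin helpers above, with made-up numbers; the expected tuples are derived from the loops, not from a captured run.

_normalize_bins([0.0, 10.0])
# -> [('0', None, 0.0), ('1', 0.0, 10.0), ('2', 10.0, None)]
# Explicit edges become (label, start, end) tuples with open-ended outer bins.

# Assuming a StatsResult with min_val=0 and max_val=30:
_auto_bins(stats, num_bins=3)
# -> [('0', None, 10.0), ('1', 10.0, 20.0), ('2', 20.0, None)]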
1715
+ def _settings_filepath(namespace: str, dataset_name: str) -> str:
1716
+ return os.path.join(
1717
+ get_dataset_output_dir(data_path(), namespace, dataset_name), DATASET_SETTINGS_FILENAME)
lilac/data/dataset_test_utils.py ADDED
@@ -0,0 +1,127 @@
1
+ """Tests utils of for dataset_test."""
2
+ import os
3
+ import pathlib
4
+ from datetime import datetime
5
+ from typing import Optional, Type, cast
6
+
7
+ import numpy as np
8
+ from typing_extensions import Protocol
9
+
10
+ from ..embeddings.vector_store import VectorDBIndex
11
+ from ..schema import (
12
+ MANIFEST_FILENAME,
13
+ PARQUET_FILENAME_PREFIX,
14
+ VALUE_KEY,
15
+ DataType,
16
+ Field,
17
+ Item,
18
+ PathKey,
19
+ Schema,
20
+ SourceManifest,
21
+ )
22
+ from ..utils import get_dataset_output_dir, open_file
23
+ from .dataset import Dataset
24
+ from .dataset_utils import is_primitive, write_items_to_parquet
25
+
26
+ TEST_NAMESPACE = 'test_namespace'
27
+ TEST_DATASET_NAME = 'test_dataset'
28
+
29
+
30
+ def _infer_dtype(value: Item) -> DataType:
31
+ if isinstance(value, str):
32
+ return DataType.STRING
33
+ elif isinstance(value, bool):
34
+ return DataType.BOOLEAN
35
+ elif isinstance(value, bytes):
36
+ return DataType.BINARY
37
+ elif isinstance(value, float):
38
+ return DataType.FLOAT32
39
+ elif isinstance(value, int):
40
+ return DataType.INT32
41
+ elif isinstance(value, datetime):
42
+ return DataType.TIMESTAMP
43
+ else:
44
+ raise ValueError(f'Cannot infer dtype of primitive value: {value}')
45
+
46
+
47
+ def _infer_field(item: Item) -> Field:
48
+ """Infer the schema from the items."""
49
+ if isinstance(item, dict):
50
+ fields: dict[str, Field] = {}
51
+ for k, v in item.items():
52
+ fields[k] = _infer_field(cast(Item, v))
53
+ dtype = None
54
+ if VALUE_KEY in fields:
55
+ dtype = fields[VALUE_KEY].dtype
56
+ del fields[VALUE_KEY]
57
+ return Field(fields=fields, dtype=dtype)
58
+ elif is_primitive(item):
59
+ return Field(dtype=_infer_dtype(item))
60
+ elif isinstance(item, list):
61
+ return Field(repeated_field=_infer_field(item[0]))
62
+ else:
63
+ raise ValueError(f'Cannot infer schema of item: {item}')
64
+
65
+
66
+ def _infer_schema(items: list[Item]) -> Schema:
67
+ """Infer the schema from the items."""
68
+ schema = Schema(fields={})
69
+ for item in items:
70
+ field = _infer_field(item)
71
+ if not field.fields:
72
+ raise ValueError(f'Invalid schema of item. Expected an object, but got: {item}')
73
+ schema.fields = {**schema.fields, **field.fields}
74
+ return schema
75
+
76
+
77
+ class TestDataMaker(Protocol):
78
+ """A function that creates a test dataset."""
79
+
80
+ def __call__(self, items: list[Item], schema: Optional[Schema] = None) -> Dataset:
81
+ """Create a test dataset."""
82
+ ...
83
+
84
+
85
+ def make_dataset(dataset_cls: Type[Dataset],
86
+ tmp_path: pathlib.Path,
87
+ items: list[Item],
88
+ schema: Optional[Schema] = None) -> Dataset:
89
+ """Create a test dataset."""
90
+ schema = schema or _infer_schema(items)
91
+ _write_items(tmp_path, TEST_DATASET_NAME, items, schema)
92
+ return dataset_cls(TEST_NAMESPACE, TEST_DATASET_NAME)
93
+
94
+
95
+ def _write_items(tmpdir: pathlib.Path, dataset_name: str, items: list[Item],
96
+ schema: Schema) -> None:
97
+ """Write the items JSON to the dataset format: manifest.json and parquet files."""
98
+ source_dir = get_dataset_output_dir(str(tmpdir), TEST_NAMESPACE, dataset_name)
99
+ os.makedirs(source_dir)
100
+
101
+ simple_parquet_files, _ = write_items_to_parquet(
102
+ items, source_dir, schema, filename_prefix=PARQUET_FILENAME_PREFIX, shard_index=0, num_shards=1)
103
+ manifest = SourceManifest(files=[simple_parquet_files], data_schema=schema)
104
+ with open_file(os.path.join(source_dir, MANIFEST_FILENAME), 'w') as f:
105
+ f.write(manifest.json(indent=2, exclude_none=True))
106
+
107
+
108
+ def enriched_item(value: Optional[Item] = None, metadata: dict[str, Item] = {}) -> Item:
109
+ """Wrap a value in a dict with the value key."""
110
+ return {VALUE_KEY: value, **metadata}
111
+
112
+
113
+ def make_vector_index(vector_store: str, vector_dict: dict[PathKey,
114
+ list[list[float]]]) -> VectorDBIndex:
115
+ """Make a vector index from a dictionary of vector keys to vectors."""
116
+ embeddings: list[np.ndarray] = []
117
+ spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
118
+ for path_key, vectors in vector_dict.items():
119
+ vector_spans: list[tuple[int, int]] = []
120
+ for i, vector in enumerate(vectors):
121
+ embeddings.append(np.array(vector))
122
+ vector_spans.append((0, 0))
123
+ spans.append((path_key, vector_spans))
124
+
125
+ vector_index = VectorDBIndex(vector_store)
126
+ vector_index.add(spans, np.array(embeddings))
127
+ return vector_index
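A hedged sketch of how these helpers might be combined in a pytest test; `DatasetDuckDB`, the `tmp_path` fixture and the 'numpy' vector store name are assumptions of this illustration, not taken from this diff.

# Hypothetical test body.
items = [{'text': enriched_item('hello', {'test_signal': {'len': 5}})}]
dataset = make_dataset(DatasetDuckDB, tmp_path, items)  # schema is inferred from the items

vector_index = make_vector_index('numpy', {('some_uuid', 0): [[1.0, 0.0], [0.0, 1.0]]})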
lilac/data/dataset_utils.py ADDED
@@ -0,0 +1,308 @@
1
+ """Utilities for working with datasets."""
2
+
3
+ import gc
4
+ import json
5
+ import math
6
+ import os
7
+ import pprint
8
+ import secrets
9
+ from collections.abc import Iterable
10
+ from typing import Any, Callable, Iterator, Optional, Sequence, TypeVar, Union, cast
11
+
12
+ import numpy as np
13
+ import pyarrow as pa
14
+
15
+ from ..batch_utils import deep_flatten
16
+ from ..embeddings.vector_store import VectorDBIndex
17
+ from ..env import env
18
+ from ..parquet_writer import ParquetWriter
19
+ from ..schema import (
20
+ EMBEDDING_KEY,
21
+ PATH_WILDCARD,
22
+ TEXT_SPAN_END_FEATURE,
23
+ TEXT_SPAN_START_FEATURE,
24
+ UUID_COLUMN,
25
+ VALUE_KEY,
26
+ Field,
27
+ Item,
28
+ PathKey,
29
+ PathTuple,
30
+ Schema,
31
+ VectorKey,
32
+ field,
33
+ schema,
34
+ schema_to_arrow_schema,
35
+ )
36
+ from ..signals.signal import Signal
37
+ from ..utils import is_primitive, log, open_file
38
+
39
+
40
+ def _replace_embeddings_with_none(input: Item) -> Item:
41
+ if isinstance(input, np.ndarray):
42
+ return None
43
+ if isinstance(input, dict):
44
+ return {k: _replace_embeddings_with_none(v) for k, v in input.items()}
45
+ if isinstance(input, list):
46
+ return [_replace_embeddings_with_none(v) for v in input]
47
+
48
+ return input
49
+
50
+
51
+ def replace_embeddings_with_none(input: Item) -> Item:
52
+ """Replaces all embeddings with None."""
53
+ return cast(Item, _replace_embeddings_with_none(input))
54
+
55
+
56
+ def count_primitives(input: Union[Iterable, Iterator]) -> int:
57
+ """Iterate through each element of the input, flattening each one, computing a count.
58
+
59
+ Sum the final set of counts. This is the important iterable not to exhaust.
60
+ """
61
+ return sum((len(list(deep_flatten(i))) for i in input))
62
+
63
+
64
+ def _wrap_value_in_dict(input: Union[object, dict], props: PathTuple) -> Union[object, dict]:
65
+ # If the signal produced no value, or nan, we should return None so the parquet value is sparse.
66
+ if isinstance(input, float) and math.isnan(input):
67
+ input = None
68
+ for prop in reversed(props):
69
+ input = {prop: input}
70
+ return input
71
+
72
+
73
+ def _wrap_in_dicts(input: Union[object, Iterable[object]],
74
+ spec: list[PathTuple]) -> Union[object, Iterable[object]]:
75
+ """Wraps an object or iterable in a dict according to the spec."""
76
+ props = spec[0] if spec else tuple()
77
+ if len(spec) == 1:
78
+ return _wrap_value_in_dict(input, props)
79
+ if input is None or isinstance(input, float) and math.isnan(input):
80
+ # Return empty dict for missing inputs.
81
+ return {}
82
+ res = [_wrap_in_dicts(elem, spec[1:]) for elem in cast(Iterable, input)]
83
+ return _wrap_value_in_dict(res, props)
84
+
85
+
86
+ def wrap_in_dicts(input: Iterable[object], spec: list[PathTuple]) -> Iterable[object]:
87
+ """Wraps an object or iterable in a dict according to the spec."""
88
+ return [_wrap_in_dicts(elem, spec) for elem in input]
89
+
90
+
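A minimal sketch of the wrapping behavior above, with illustrative paths; the expected values follow from `_wrap_value_in_dict`.

wrap_in_dicts([[1, 2]], spec=[('a',), ('b',)])
# -> [{'a': [{'b': 1}, {'b': 2}]}]

wrap_in_dicts([None], spec=[('a',), ('b',)])
# -> [{}]   (missing inputs collapse to an empty dict)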
91
+ def _merge_field_into(schema: Field, destination: Field) -> None:
92
+ if isinstance(schema, Field):
93
+ destination.signal = destination.signal or schema.signal
94
+ destination.dtype = destination.dtype or schema.dtype
95
+ if schema.fields:
96
+ destination.fields = destination.fields or {}
97
+ for field_name, subfield in schema.fields.items():
98
+ if field_name not in destination.fields:
99
+ destination.fields[field_name] = subfield.copy(deep=True)
100
+ else:
101
+ _merge_field_into(subfield, destination.fields[field_name])
102
+ elif schema.repeated_field:
103
+ if not destination.repeated_field:
104
+ raise ValueError('Failed to merge schemas. Origin schema is repeated, but destination is not')
105
+ _merge_field_into(schema.repeated_field, destination.repeated_field)
106
+ else:
107
+ if destination.dtype != schema.dtype:
108
+ raise ValueError(f'Failed to merge schemas. Origin schema has dtype {schema.dtype}, '
109
+ f'but destination has dtype {destination.dtype}')
110
+
111
+
112
+ def merge_schemas(schemas: Sequence[Union[Schema, Field]]) -> Schema:
113
+ """Merge a list of schemas."""
114
+ merged_schema = Schema(fields={})
115
+ for s in schemas:
116
+ _merge_field_into(cast(Field, s), cast(Field, merged_schema))
117
+ return merged_schema
118
+
119
+
120
+ def schema_contains_path(schema: Schema, path: PathTuple) -> bool:
121
+ """Check if a schema contains a path."""
122
+ current_field = cast(Field, schema)
123
+ for path_part in path:
124
+ if path_part == PATH_WILDCARD:
125
+ if current_field.repeated_field is None:
126
+ return False
127
+ current_field = current_field.repeated_field
128
+ else:
129
+ if current_field.fields is None or path_part not in current_field.fields:
130
+ return False
131
+ current_field = current_field.fields[str(path_part)]
132
+ return True
133
+
134
+
135
+ def create_signal_schema(signal: Signal, source_path: PathTuple, current_schema: Schema) -> Schema:
136
+ """Create a schema describing the enriched fields added an enrichment."""
137
+ leafs = current_schema.leafs
138
+ # Validate that the enrich fields are actually a valid leaf path.
139
+ if source_path not in leafs:
140
+ raise ValueError(f'"{source_path}" is not a valid leaf path. Leaf paths: {leafs.keys()}')
141
+
142
+ signal_schema = signal.fields()
143
+ signal_schema.signal = signal.dict()
144
+
145
+ enriched_schema = field(fields={signal.key(is_computed_signal=True): signal_schema})
146
+
147
+ for path_part in reversed(source_path):
148
+ if path_part == PATH_WILDCARD:
149
+ enriched_schema = Field(repeated_field=enriched_schema)
150
+ else:
151
+ enriched_schema = Field(fields={path_part: enriched_schema})
152
+
153
+ if not enriched_schema.fields:
154
+ raise ValueError('This should not happen since enriched_schema always has fields (see above)')
155
+
156
+ return schema({UUID_COLUMN: 'string', **cast(dict, enriched_schema.fields)})
157
+
158
+
159
+ def write_embeddings_to_disk(vector_store: str, uuids: Iterable[str], signal_items: Iterable[Item],
160
+ output_dir: str) -> None:
161
+ """Write a set of embeddings to disk."""
162
+
163
+ def embedding_predicate(input: Any) -> bool:
164
+ return (isinstance(input, list) and len(input) > 0 and isinstance(input[0], dict) and
165
+ EMBEDDING_KEY in input[0])
166
+
167
+ path_keys = flatten_keys(uuids, signal_items, is_primitive_predicate=embedding_predicate)
168
+ all_embeddings = cast(Iterable[Item],
169
+ deep_flatten(signal_items, is_primitive_predicate=embedding_predicate))
170
+
171
+ embedding_vectors: list[np.ndarray] = []
172
+ all_spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
173
+ for path_key, embeddings in zip(path_keys, all_embeddings):
174
+ if not path_key or not embeddings:
175
+ # Sparse embeddings may not have an embedding for every key.
176
+ continue
177
+
178
+ spans: list[tuple[int, int]] = []
179
+ for e in embeddings:
180
+ span = e[VALUE_KEY]
181
+ vector = e[EMBEDDING_KEY]
182
+ # We flatten with reshape(-1) because embedding functions can return outer dimensions of 1.
183
+ embedding_vectors.append(vector.reshape(-1))
184
+ spans.append((span[TEXT_SPAN_START_FEATURE], span[TEXT_SPAN_END_FEATURE]))
185
+ all_spans.append((path_key, spans))
186
+ embedding_matrix = np.array(embedding_vectors)
187
+ del path_keys, all_embeddings, embedding_vectors
188
+ gc.collect()
189
+
190
+ # Write to disk.
191
+ vector_index = VectorDBIndex(vector_store)
192
+ vector_index.add(all_spans, embedding_matrix)
193
+ vector_index.save(output_dir)
194
+
195
+ del vector_index
196
+ gc.collect()
197
+
198
+
199
+ def write_items_to_parquet(items: Iterable[Item], output_dir: str, schema: Schema,
200
+ filename_prefix: str, shard_index: int,
201
+ num_shards: int) -> tuple[str, int]:
202
+ """Write a set of items to a parquet file, in columnar format."""
203
+ arrow_schema = schema_to_arrow_schema(schema)
204
+ out_filename = parquet_filename(filename_prefix, shard_index, num_shards)
205
+ filepath = os.path.join(output_dir, out_filename)
206
+ f = open_file(filepath, mode='wb')
207
+ writer = ParquetWriter(schema)
208
+ writer.open(f)
209
+ debug = env('DEBUG', False)
210
+ num_items = 0
211
+ for item in items:
212
+ # Add a UUID column.
213
+ if UUID_COLUMN not in item:
214
+ item[UUID_COLUMN] = secrets.token_urlsafe(nbytes=12) # 16 base64 characters.
215
+ if debug:
216
+ try:
217
+ _validate(item, arrow_schema)
218
+ except Exception as e:
219
+ raise ValueError(f'Error validating item: {json.dumps(item)}') from e
220
+ writer.write(item)
221
+ num_items += 1
222
+ writer.close()
223
+ f.close()
224
+ return out_filename, num_items
225
+
226
+
227
+ def _validate(item: Item, schema: pa.Schema) -> None:
228
+ # Try to parse the item using the inferred schema.
229
+ try:
230
+ pa.RecordBatch.from_pylist([item], schema=schema)
231
+ except pa.ArrowTypeError:
232
+ log('Failed to parse arrow item using the arrow schema.')
233
+ log('Item:')
234
+ log(pprint.pformat(item, indent=2))
235
+ log('Arrow schema:')
236
+ log(schema)
237
+ raise # Re-raise the same exception, same stacktrace.
238
+
239
+
240
+ def parquet_filename(prefix: str, shard_index: int, num_shards: int) -> str:
241
+ """Return the filename for a parquet file."""
242
+ return f'{prefix}-{shard_index:05d}-of-{num_shards:05d}.parquet'
243
+
244
+
245
+ def _flatten_keys(uuid: str, nested_input: Iterable, location: list[int],
246
+ is_primitive_predicate: Callable[[object], bool]) -> Iterator[VectorKey]:
247
+ if is_primitive_predicate(nested_input) or is_primitive(nested_input) or isinstance(
248
+ nested_input, dict):
249
+ yield (uuid, *location)
250
+ return
251
+
252
+ for i, input in enumerate(nested_input):
253
+ yield from _flatten_keys(uuid, input, [*location, i], is_primitive_predicate)
254
+
255
+
256
+ def flatten_keys(
257
+ uuids: Iterable[str],
258
+ nested_input: Iterable,
259
+ is_primitive_predicate: Callable[[object],
260
+ bool] = is_primitive) -> Iterator[Optional[VectorKey]]:
261
+ """Flatten the uuid keys of a nested input."""
262
+ for uuid, input in zip(uuids, nested_input):
263
+ if input is None:
264
+ yield None
265
+ continue
266
+ yield from _flatten_keys(uuid, input, [], is_primitive_predicate)
267
+
268
+
269
+ Tin = TypeVar('Tin')
270
+ Tout = TypeVar('Tout')
271
+
272
+
273
+ def sparse_to_dense_compute(
274
+ sparse_input: Iterator[Optional[Tin]],
275
+ func: Callable[[Iterable[Tin]], Iterable[Tout]]) -> Iterator[Optional[Tout]]:
276
+ """Densifies the input before calling the provided `func` and sparsifies the output."""
277
+ locations: list[int] = []
278
+ total_size: int = 0
279
+
280
+ def densify(x: Iterator[Optional[Tin]]) -> Iterator[Tin]:
281
+ nonlocal locations, total_size
282
+ for i, value in enumerate(x):
283
+ total_size += 1
284
+ if value is not None:
285
+ locations.append(i)
286
+ yield value
287
+
288
+ dense_input = densify(sparse_input)
289
+ dense_output = iter(func(dense_input))
290
+ index = 0
291
+
292
+ location_index = 0
293
+
294
+ while True:
295
+ try:
296
+ out = next(dense_output)
297
+ out_index = locations[location_index]
298
+ while index < out_index:
299
+ yield None
300
+ index += 1
301
+ yield out
302
+ location_index += 1
303
+ index += 1
304
+ except StopIteration:
305
+ while index < total_size:
306
+ yield None
307
+ index += 1
308
+ return
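A minimal sketch of how `sparse_to_dense_compute` behaves (illustrative only; `double_all` is a hypothetical batch function, not part of this change): `None` gaps in the input are preserved in the output, while `func` only ever sees the dense values.

from typing import Iterable, Iterator

from lilac.data.dataset_utils import sparse_to_dense_compute

def double_all(values: Iterable[int]) -> Iterator[int]:
  # Stand-in for an expensive batched computation that never sees None.
  for v in values:
    yield v * 2

sparse = iter([1, None, 2, None, 3])
# Only 1, 2 and 3 are passed to double_all; the None positions come back untouched.
assert list(sparse_to_dense_compute(sparse, double_all)) == [2, None, 4, None, 6]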
lilac/data/duckdb_utils.py ADDED
@@ -0,0 +1,25 @@
1
+ """Utils for duckdb."""
2
+ import os
3
+
4
+ import duckdb
5
+
6
+ from ..env import data_path, env
7
+
8
+
9
+ def duckdb_setup(con: duckdb.DuckDBPyConnection) -> str:
10
+ """Set up DuckDB. This includes setting up the extensions directory and GCS access."""
11
+ con.execute(f"""
12
+ SET extension_directory='{os.path.join(data_path(), '.duckdb')}';
13
+ """)
14
+
15
+ con.install_extension('httpfs')
16
+ con.load_extension('httpfs')
17
+
18
+ if env('GCS_REGION'):
19
+ return f"""
20
+ SET s3_region='{env('GCS_REGION')}';
21
+ SET s3_access_key_id='{env('GCS_ACCESS_KEY')}';
22
+ SET s3_secret_access_key='{env('GCS_SECRET_KEY')}';
23
+ SET s3_endpoint='storage.googleapis.com';
24
+ """
25
+ return ''
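A short usage sketch (illustrative): `duckdb_setup` installs the httpfs extension and returns the GCS SQL for the caller to execute, so a typical call site looks like this.

import duckdb

from lilac.data.duckdb_utils import duckdb_setup

con = duckdb.connect()
gcs_sql = duckdb_setup(con)  # Sets the extension directory and loads httpfs.
if gcs_sql:
  # Non-empty only when GCS_REGION / GCS_ACCESS_KEY / GCS_SECRET_KEY are configured.
  con.execute(gcs_sql)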
lilac/data_loader.py ADDED
@@ -0,0 +1,110 @@
1
+ """A data loader standalone binary. This should only be run as a script to load a dataset.
2
+
3
+ To run the source loader as a binary directly:
4
+
5
+ poetry run python -m lilac.data_loader \
6
+ --dataset_name=movies_dataset \
7
+ --output_dir=./data/ \
8
+ --config_path=./datasets/the_movies_dataset.json
9
+ """
10
+ import os
11
+ import pathlib
12
+ import uuid
13
+ from typing import Iterable, Optional, Union
14
+
15
+ import pandas as pd
16
+
17
+ from .data.dataset import Dataset
18
+ from .data.dataset_utils import write_items_to_parquet
19
+ from .db_manager import get_dataset
20
+ from .env import data_path
21
+ from .schema import (
22
+ MANIFEST_FILENAME,
23
+ PARQUET_FILENAME_PREFIX,
24
+ UUID_COLUMN,
25
+ Field,
26
+ Item,
27
+ Schema,
28
+ SourceManifest,
29
+ field,
30
+ is_float,
31
+ )
32
+ from .sources.source import Source
33
+ from .tasks import TaskStepId, progress
34
+ from .utils import get_dataset_output_dir, log, open_file
35
+
36
+
37
+ def create_dataset(
38
+ namespace: str,
39
+ dataset_name: str,
40
+ source_config: Source,
41
+ ) -> Dataset:
42
+ """Load a dataset from a given source configuration."""
43
+ process_source(data_path(), namespace, dataset_name, source_config)
44
+ return get_dataset(namespace, dataset_name)
45
+
46
+
47
+ def process_source(base_dir: Union[str, pathlib.Path],
48
+ namespace: str,
49
+ dataset_name: str,
50
+ source: Source,
51
+ task_step_id: Optional[TaskStepId] = None) -> tuple[str, int]:
52
+ """Process a source."""
53
+ output_dir = get_dataset_output_dir(base_dir, namespace, dataset_name)
54
+
55
+ source.setup()
56
+ source_schema = source.source_schema()
57
+ items = source.process()
58
+
59
+ # Add UUIDs and fix NaN in string columns.
60
+ items = normalize_items(items, source_schema.fields)
61
+
62
+ # Add progress.
63
+ items = progress(
64
+ items,
65
+ task_step_id=task_step_id,
66
+ estimated_len=source_schema.num_items,
67
+ step_description=f'Reading from source {source.name}...')
68
+
69
+ # Filter out the `None`s after progress.
70
+ items = (item for item in items if item is not None)
71
+
72
+ data_schema = Schema(fields={**source_schema.fields, UUID_COLUMN: field('string')})
73
+ filepath, num_items = write_items_to_parquet(
74
+ items=items,
75
+ output_dir=output_dir,
76
+ schema=data_schema,
77
+ filename_prefix=PARQUET_FILENAME_PREFIX,
78
+ shard_index=0,
79
+ num_shards=1)
80
+
81
+ filenames = [os.path.basename(filepath)]
82
+ manifest = SourceManifest(files=filenames, data_schema=data_schema, images=None)
83
+ with open_file(os.path.join(output_dir, MANIFEST_FILENAME), 'w') as f:
84
+ f.write(manifest.json(indent=2, exclude_none=True))
85
+ log(f'Dataset "{dataset_name}" written to {output_dir}')
86
+
87
+ return output_dir, num_items
88
+
89
+
90
+ def normalize_items(items: Iterable[Item], fields: dict[str, Field]) -> Iterable[Item]:
91
+ """Sanitize items by removing NaNs and NaTs."""
92
+ replace_nan_fields = [
93
+ field_name for field_name, field in fields.items() if field.dtype and not is_float(field.dtype)
94
+ ]
95
+ for item in items:
96
+ if item is None:
97
+ yield item
98
+ continue
99
+
100
+ # Add row uuid if it doesn't exist.
101
+ if UUID_COLUMN not in item:
102
+ item[UUID_COLUMN] = uuid.uuid4().hex
103
+
104
+ # Fix NaN values.
105
+ for field_name in replace_nan_fields:
106
+ item_value = item.get(field_name)
107
+ if item_value and pd.isna(item_value):
108
+ item[field_name] = None
109
+
110
+ yield item
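A small sketch of `normalize_items` on its own (the `field` and `UUID_COLUMN` helpers are the ones imported from `lilac.schema` above):

import numpy as np

from lilac.data_loader import normalize_items
from lilac.schema import UUID_COLUMN, field

items = [{'text': np.nan}, {'text': 'hello'}]
cleaned = list(normalize_items(items, fields={'text': field('string')}))

assert all(UUID_COLUMN in item for item in cleaned)  # Every row gets a uuid.
assert cleaned[0]['text'] is None  # NaN in a non-float column is replaced with None.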
lilac/db_manager.py ADDED
@@ -0,0 +1,42 @@
1
+ """Manages the mapping from dataset name to database instance."""
2
+ import os
3
+ import threading
4
+ from typing import Type
5
+
6
+ from .data.dataset import Dataset
7
+
8
+ _DEFAULT_DATASET_CLS: Type[Dataset]
9
+
10
+ _CACHED_DATASETS: dict[str, Dataset] = {}
11
+
12
+ _db_lock = threading.Lock()
13
+
14
+
15
+ def get_dataset(namespace: str, dataset_name: str) -> Dataset:
16
+ """Get the dataset instance."""
17
+ if not _DEFAULT_DATASET_CLS:
18
+ raise ValueError('Default dataset class not set.')
19
+ cache_key = f'{namespace}/{dataset_name}'
20
+ # https://docs.pytest.org/en/latest/example/simple.html#pytest-current-test-environment-variable
21
+ inside_test = 'PYTEST_CURRENT_TEST' in os.environ
22
+ with _db_lock:
23
+ if cache_key not in _CACHED_DATASETS or inside_test:
24
+ _CACHED_DATASETS[cache_key] = _DEFAULT_DATASET_CLS(
25
+ namespace=namespace, dataset_name=dataset_name)
26
+ return _CACHED_DATASETS[cache_key]
27
+
28
+
29
+ def remove_dataset_from_cache(namespace: str, dataset_name: str) -> None:
30
+ """Remove the dataset from the db manager cache."""
31
+ cache_key = f'{namespace}/{dataset_name}'
32
+ with _db_lock:
33
+ if cache_key in _CACHED_DATASETS:
34
+ del _CACHED_DATASETS[cache_key]
35
+
36
+
37
+ # TODO(nsthorat): Make this a registry once we have multiple dataset implementations. This breaks a
38
+ # circular dependency.
39
+ def set_default_dataset_cls(dataset_cls: Type[Dataset]) -> None:
40
+ """Set the default dataset class."""
41
+ global _DEFAULT_DATASET_CLS
42
+ _DEFAULT_DATASET_CLS = dataset_cls
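A sketch of the intended wiring (the `DatasetDuckDB` import path and class name are assumptions based on the file list in this change, and the example assumes a `local/movies_dataset` dataset already exists on disk):

from lilac.data.dataset_duckdb import DatasetDuckDB  # Class name assumed.
from lilac.db_manager import get_dataset, set_default_dataset_cls

set_default_dataset_cls(DatasetDuckDB)  # Must run once before get_dataset().

ds = get_dataset('local', 'movies_dataset')
# Outside of pytest, repeated lookups return the same cached instance.
assert ds is get_dataset('local', 'movies_dataset')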
lilac/embeddings/__init__.py ADDED
File without changes
lilac/embeddings/cohere.py ADDED
@@ -0,0 +1,59 @@
1
+ """Cohere embeddings."""
2
+ from typing import TYPE_CHECKING, Iterable, cast
3
+
4
+ import numpy as np
5
+ from typing_extensions import override
6
+
7
+ from ..env import env
8
+ from ..schema import Item, RichData
9
+ from ..signals.signal import TextEmbeddingSignal
10
+ from ..signals.splitters.chunk_splitter import split_text
11
+ from .embedding import compute_split_embeddings
12
+
13
+ if TYPE_CHECKING:
14
+ from cohere import Client
15
+
16
+ NUM_PARALLEL_REQUESTS = 10
17
+ COHERE_BATCH_SIZE = 96
18
+
19
+
20
+ class Cohere(TextEmbeddingSignal):
21
+ """Computes embeddings using Cohere's embedding API.
22
+
23
+ <br>**Important**: This will send data to an external server!
24
+
25
+ <br>To use this signal, you must get a Cohere API key from
26
+ [cohere.com/embed](https://cohere.com/embed) and add it to your .env.local.
27
+
28
+ <br>For details on pricing, see: https://cohere.com/pricing.
29
+ """
30
+
31
+ name = 'cohere'
32
+ display_name = 'Cohere Embeddings'
33
+
34
+ _model: 'Client'
35
+
36
+ @override
37
+ def setup(self) -> None:
38
+ """Validate that the api key and python package exists in environment."""
39
+ api_key = env('COHERE_API_KEY')
40
+ if not api_key:
41
+ raise ValueError('`COHERE_API_KEY` environment variable not set.')
42
+ try:
43
+ import cohere
44
+ self._model = cohere.Client(api_key, max_retries=10)
45
+ except ImportError:
46
+ raise ImportError('Could not import the "cohere" python package. '
47
+ 'Please install it with `pip install cohere`.')
48
+
49
+ @override
50
+ def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
51
+ """Compute embeddings for the given documents."""
52
+
53
+ def embed_fn(texts: list[str]) -> list[np.ndarray]:
54
+ return self._model.embed(texts, truncate='END').embeddings
55
+
56
+ docs = cast(Iterable[str], docs)
57
+ split_fn = split_text if self._split else None
58
+ yield from compute_split_embeddings(
59
+ docs, COHERE_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
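An illustrative way to run the signal standalone (requires `COHERE_API_KEY` in `.env.local` and the `cohere` package; this is a hedged sketch, not the canonical entry point):

from lilac.embeddings.cohere import Cohere

signal = Cohere(split=True)  # Split long documents into chunks before embedding.
signal.setup()  # Validates the API key and constructs the client.
for item in signal.compute(['A short document to embed.']):
  print(item)  # A list of embedded spans per document (None for empty documents).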
lilac/embeddings/default_vector_stores.py ADDED
@@ -0,0 +1,10 @@
1
+ """Registers all vector stores."""
2
+ from .vector_store import register_vector_store
3
+ from .vector_store_hnsw import HNSWVectorStore
4
+ from .vector_store_numpy import NumpyVectorStore
5
+
6
+
7
+ def register_default_vector_stores() -> None:
8
+ """Register all the default vector stores."""
9
+ register_vector_store(HNSWVectorStore)
10
+ register_vector_store(NumpyVectorStore)
lilac/embeddings/embedding.py ADDED
@@ -0,0 +1,110 @@
1
+ """Embedding registry."""
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from typing import Callable, Generator, Iterable, Iterator, Optional, Union, cast
4
+
5
+ import numpy as np
6
+ from pydantic import StrictStr
7
+ from sklearn.preprocessing import normalize
8
+
9
+ from ..schema import (
10
+ EMBEDDING_KEY,
11
+ TEXT_SPAN_END_FEATURE,
12
+ TEXT_SPAN_START_FEATURE,
13
+ VALUE_KEY,
14
+ Item,
15
+ RichData,
16
+ SpanVector,
17
+ lilac_embedding,
18
+ )
19
+ from ..signals.signal import TextEmbeddingSignal, get_signal_by_type
20
+ from ..signals.splitters.chunk_splitter import TextChunk
21
+ from ..utils import chunks
22
+
23
+ EmbeddingId = Union[StrictStr, TextEmbeddingSignal]
24
+
25
+ EmbedFn = Callable[[Iterable[RichData]], Iterator[list[SpanVector]]]
26
+
27
+
28
+ def get_embed_fn(embedding_name: str, split: bool) -> EmbedFn:
29
+ """Return a function that returns the embedding matrix for the given embedding signal."""
30
+ embedding_cls = get_signal_by_type(embedding_name, TextEmbeddingSignal)
31
+ embedding = embedding_cls(split=split)
32
+ embedding.setup()
33
+
34
+ def _embed_fn(data: Iterable[RichData]) -> Iterator[list[SpanVector]]:
35
+ items = embedding.compute(data)
36
+
37
+ for item in items:
38
+ if not item:
39
+ raise ValueError('Embedding signal returned None.')
40
+
41
+ yield [{
42
+ 'vector': item_val[EMBEDDING_KEY].reshape(-1),
43
+ 'span':
44
+ (item_val[VALUE_KEY][TEXT_SPAN_START_FEATURE], item_val[VALUE_KEY][TEXT_SPAN_END_FEATURE])
45
+ } for item_val in item]
46
+
47
+ return _embed_fn
48
+
49
+
50
+ def compute_split_embeddings(docs: Iterable[str],
51
+ batch_size: int,
52
+ embed_fn: Callable[[list[str]], list[np.ndarray]],
53
+ split_fn: Optional[Callable[[str], list[TextChunk]]] = None,
54
+ num_parallel_requests: int = 1) -> Generator[Item, None, None]:
55
+ """Compute text embeddings in batches of chunks, using the provided splitter and embedding fn."""
56
+ pool = ThreadPoolExecutor()
57
+
58
+ def _splitter(doc: str) -> list[TextChunk]:
59
+ if not doc:
60
+ return []
61
+ if split_fn:
62
+ return split_fn(doc)
63
+ else:
64
+ # Return a single chunk that spans the entire document.
65
+ return [(doc, (0, len(doc)))]
66
+
67
+ num_docs = 0
68
+
69
+ def _flat_split_batch_docs(docs: Iterable[str]) -> Generator[tuple[int, TextChunk], None, None]:
70
+ """Split a batch of documents into chunks and yield them."""
71
+ nonlocal num_docs
72
+ for i, doc in enumerate(docs):
73
+ num_docs += 1
74
+ chunks = _splitter(doc)
75
+ for chunk in chunks:
76
+ yield (i, chunk)
77
+
78
+ doc_chunks = _flat_split_batch_docs(docs)
79
+ items_to_yield: Optional[list[Item]] = None
80
+ current_index = 0
81
+
82
+ mega_batch_size = batch_size * num_parallel_requests
83
+
84
+ for batch in chunks(doc_chunks, mega_batch_size):
85
+ texts = [text for _, (text, _) in batch]
86
+ embeddings: list[np.ndarray] = []
87
+
88
+ for x in list(pool.map(lambda x: embed_fn(x), chunks(texts, batch_size))):
89
+ embeddings.extend(x)
90
+ matrix = cast(np.ndarray, normalize(np.array(embeddings, dtype=np.float32)))
91
+ # np.split returns a shallow copy of each embedding so we don't increase the mem footprint.
92
+ embeddings_batch = cast(list[np.ndarray], np.split(matrix, matrix.shape[0]))
93
+ for (index, (_, (start, end))), embedding in zip(batch, embeddings_batch):
94
+ embedding = embedding.reshape(-1)
95
+ if index == current_index:
96
+ if items_to_yield is None:
97
+ items_to_yield = []
98
+ items_to_yield.append(lilac_embedding(start, end, embedding))
99
+ else:
100
+ yield items_to_yield
101
+ current_index += 1
102
+ while current_index < index:
103
+ yield None
104
+ current_index += 1
105
+ items_to_yield = [lilac_embedding(start, end, embedding)]
106
+
107
+ while current_index < num_docs:
108
+ yield items_to_yield
109
+ items_to_yield = None
110
+ current_index += 1
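A toy walk-through of `compute_split_embeddings` (purely illustrative; `fake_embed` stands in for a real embedding API):

import numpy as np

from lilac.embeddings.embedding import compute_split_embeddings

def fake_embed(texts: list[str]) -> list[np.ndarray]:
  # Pretend every chunk embeds to the same 4-d vector.
  return [np.ones(4, dtype=np.float32) for _ in texts]

docs = ['hello world', '', 'another doc']
items = list(compute_split_embeddings(docs, batch_size=2, embed_fn=fake_embed, split_fn=None))

assert len(items) == len(docs)  # One entry per input document.
assert items[1] is None  # Empty documents yield None.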
lilac/embeddings/gte.py ADDED
@@ -0,0 +1,63 @@
1
+ """General Text Embeddings (GTE) model. Open-source model, designed to run on device."""
2
+ from typing import TYPE_CHECKING, Iterable, cast
3
+
4
+ from typing_extensions import override
5
+
6
+ from ..schema import Item, RichData
7
+ from ..signals.signal import TextEmbeddingSignal
8
+ from ..signals.splitters.chunk_splitter import split_text
9
+ from .embedding import compute_split_embeddings
10
+ from .transformer_utils import get_model
11
+
12
+ if TYPE_CHECKING:
13
+ pass
14
+
15
+ # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
16
+ GTE_SMALL = 'thenlper/gte-small'
17
+ GTE_BASE = 'thenlper/gte-base'
18
+
19
+ # Maps a tuple of model name and device to the optimal batch size, found empirically.
20
+ _OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
21
+ GTE_SMALL: {
22
+ '': 64, # Default batch size.
23
+ 'mps': 256,
24
+ },
25
+ GTE_BASE: {
26
+ '': 64, # Default batch size.
27
+ 'mps': 128,
28
+ }
29
+ }
30
+
31
+
32
+ class GTESmall(TextEmbeddingSignal):
33
+ """Computes General Text Embeddings (GTE).
34
+
35
+ <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-small)
36
+ for details.
37
+ """
38
+
39
+ name = 'gte-small'
40
+ display_name = 'General Text Embeddings (small)'
41
+
42
+ _model_name = GTE_SMALL
43
+
44
+ @override
45
+ def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
46
+ """Call the embedding function."""
47
+ batch_size, model = get_model(self._model_name, _OPTIMAL_BATCH_SIZES[self._model_name])
48
+ embed_fn = model.encode
49
+ split_fn = split_text if self._split else None
50
+ docs = cast(Iterable[str], docs)
51
+ yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)
52
+
53
+
54
+ """Computes General Text Embeddings (GTE).
55
+ """Computes Gegeral Text Embeddings (GTE).
56
+
57
+ <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-base)
58
+ for details.
59
+ """
60
+ name = 'gte-base'
61
+ display_name = 'General Text Embeddings (base)'
62
+
63
+ _model_name = GTE_BASE
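A hedged example of running the small GTE model directly (downloads the model on first use and requires `sentence-transformers` to be installed):

from lilac.embeddings.gte import GTESmall

gte = GTESmall(split=True)
gte.setup()
for item in gte.compute(['General Text Embeddings run fully on-device.']):
  print(item)  # A list of embedded spans for this document.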
lilac/embeddings/openai.py ADDED
@@ -0,0 +1,68 @@
1
+ """OpenAI embeddings."""
2
+ from typing import TYPE_CHECKING, Any, Iterable, cast
3
+
4
+ import numpy as np
5
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
6
+ from typing_extensions import override
7
+
8
+ from ..env import env
9
+ from ..schema import Item, RichData
10
+ from ..signals.signal import TextEmbeddingSignal
11
+ from ..signals.splitters.chunk_splitter import split_text
12
+ from .embedding import compute_split_embeddings
13
+
14
+ if TYPE_CHECKING:
15
+ import openai
16
+
17
+ NUM_PARALLEL_REQUESTS = 10
18
+ OPENAI_BATCH_SIZE = 128
19
+ EMBEDDING_MODEL = 'text-embedding-ada-002'
20
+
21
+
22
+ class OpenAI(TextEmbeddingSignal):
23
+ """Computes embeddings using OpenAI's embedding API.
24
+
25
+ <br>**Important**: This will send data to an external server!
26
+
27
+ <br>To use this signal, you must get an OpenAI API key from
28
+ [platform.openai.com](https://platform.openai.com/) and add it to your .env.local.
29
+
30
+ <br>For details on pricing, see: https://openai.com/pricing.
31
+ """
32
+
33
+ name = 'openai'
34
+ display_name = 'OpenAI Embeddings'
35
+
36
+ _model: type['openai.Embedding']
37
+
38
+ @override
39
+ def setup(self) -> None:
40
+ api_key = env('OPENAI_API_KEY')
41
+ if not api_key:
42
+ raise ValueError('`OPENAI_API_KEY` environment variable not set.')
43
+ try:
44
+ import openai
45
+ openai.api_key = api_key
46
+ self._model = openai.Embedding
47
+ except ImportError:
48
+ raise ImportError('Could not import the "openai" python package. '
49
+ 'Please install it with `pip install openai`.')
50
+
51
+ @override
52
+ def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
53
+ """Compute embeddings for the given documents."""
54
+
55
+ @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
56
+ def embed_fn(texts: list[str]) -> list[np.ndarray]:
57
+
58
+ # Replace newlines, which can negatively affect performance.
59
+ # See https://github.com/search?q=repo%3Aopenai%2Fopenai-python+replace+newlines&type=code
60
+ texts = [text.replace('\n', ' ') for text in texts]
61
+
62
+ response: Any = self._model.create(input=texts, model=EMBEDDING_MODEL)
63
+ return [np.array(embedding['embedding'], dtype=np.float32) for embedding in response['data']]
64
+
65
+ docs = cast(Iterable[str], docs)
66
+ split_fn = split_text if self._split else None
67
+ yield from compute_split_embeddings(
68
+ docs, OPENAI_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/palm.py ADDED
@@ -0,0 +1,62 @@
1
+ """PaLM embeddings."""
2
+ from typing import TYPE_CHECKING, Iterable, cast
3
+
4
+ import numpy as np
5
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
6
+ from typing_extensions import override
7
+
8
+ from ..env import env
9
+ from ..schema import Item, RichData
10
+ from ..signals.signal import TextEmbeddingSignal
11
+ from ..signals.splitters.chunk_splitter import split_text
12
+ from .embedding import compute_split_embeddings
13
+
14
+ if TYPE_CHECKING:
15
+ import google.generativeai as palm
16
+
17
+ PALM_BATCH_SIZE = 1 # PaLM API only supports batch size 1.
18
+ NUM_PARALLEL_REQUESTS = 256 # Because batch size is 1, we can send many requests in parallel.
19
+ EMBEDDING_MODEL = 'models/embedding-gecko-001'
20
+
21
+
22
+ class PaLM(TextEmbeddingSignal):
23
+ """Computes embeddings using PaLM's embedding API.
24
+
25
+ <br>**Important**: This will send data to an external server!
26
+
27
+ <br>To use this signal, you must get a PaLM API key from
28
+ [makersuite.google.com](https://makersuite.google.com/app/apikey) and add it to your .env.local.
29
+ """
30
+
31
+ name = 'palm'
32
+ display_name = 'PaLM Embeddings'
33
+
34
+ _model: 'palm.generate_embeddings'
35
+
36
+ @override
37
+ def setup(self) -> None:
38
+ api_key = env('PALM_API_KEY')
39
+ if not api_key:
40
+ raise ValueError('`PALM_API_KEY` environment variable not set.')
41
+ try:
42
+ import google.generativeai as palm
43
+ palm.configure(api_key=api_key)
44
+ self._model = palm.generate_embeddings
45
+ except ImportError:
46
+ raise ImportError('Could not import the "google.generativeai" python package. '
47
+ 'Please install it with `pip install google-generativeai`.')
48
+
49
+ @override
50
+ def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
51
+ """Compute embeddings for the given documents."""
52
+
53
+ @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
54
+ def embed_fn(texts: list[str]) -> list[np.ndarray]:
55
+ assert len(texts) == 1, 'PaLM API only supports batch size 1.'
56
+ response = self._model(model=EMBEDDING_MODEL, text=texts[0])
57
+ return [np.array(response['embedding'], dtype=np.float32)]
58
+
59
+ docs = cast(Iterable[str], docs)
60
+ split_fn = split_text if self._split else None
61
+ yield from compute_split_embeddings(
62
+ docs, PALM_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/sbert.py ADDED
@@ -0,0 +1,38 @@
1
+ """Sentence-BERT embeddings. Open-source models, designed to run on device."""
2
+ from typing import Iterable, cast
3
+
4
+ from typing_extensions import override
5
+
6
+ from ..schema import Item, RichData
7
+ from ..signals.signal import TextEmbeddingSignal
8
+ from ..signals.splitters.chunk_splitter import split_text
9
+ from .embedding import compute_split_embeddings
10
+ from .transformer_utils import get_model
11
+
12
+ # The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times
13
+ # faster and still offers good quality. See https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models/
14
+ MINI_LM_MODEL = 'all-MiniLM-L6-v2'
15
+
16
+ # Maps a tuple of model name and device to the optimal batch size, found empirically.
17
+ _OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
18
+ MINI_LM_MODEL: {
19
+ '': 64, # Default batch size.
20
+ 'mps': 256,
21
+ }
22
+ }
23
+
24
+
25
+ class SBERT(TextEmbeddingSignal):
26
+ """Computes embeddings using the Sentence-BERT library."""
27
+
28
+ name = 'sbert'
29
+ display_name = 'SBERT Embeddings'
30
+
31
+ @override
32
+ def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
33
+ """Call the embedding function."""
34
+ batch_size, model = get_model(MINI_LM_MODEL, _OPTIMAL_BATCH_SIZES[MINI_LM_MODEL])
35
+ embed_fn = model.encode
36
+ split_fn = split_text if self._split else None
37
+ docs = cast(Iterable[str], docs)
38
+ yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)
lilac/embeddings/transformer_utils.py ADDED
@@ -0,0 +1,35 @@
1
+ """Utils for transformer embeddings."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import TYPE_CHECKING, Optional
6
+
7
+ from ..env import data_path
8
+ from ..utils import log
9
+
10
+ if TYPE_CHECKING:
11
+ from sentence_transformers import SentenceTransformer
12
+
13
+
14
+ def get_model(model_name: str,
15
+ optimal_batch_sizes: dict[str, int] = {}) -> tuple[int, 'SentenceTransformer']:
16
+ """Get a transformer model and the optimal batch size for it."""
17
+ try:
18
+ import torch.backends.mps
19
+ from sentence_transformers import SentenceTransformer
20
+ except ImportError:
21
+ raise ImportError('Could not import the "sentence_transformers" python package. '
22
+ 'Please install it with `pip install sentence-transformers`.')
23
+ preferred_device: Optional[str] = None
24
+ if torch.backends.mps.is_available():
25
+ preferred_device = 'mps'
26
+ elif not torch.backends.mps.is_built():
27
+ log('MPS not available because the current PyTorch install was not built with MPS enabled.')
28
+
29
+ @functools.cache
30
+ def _get_model(model_name: str) -> 'SentenceTransformer':
31
+ return SentenceTransformer(
32
+ model_name, device=preferred_device, cache_folder=os.path.join(data_path(), '.cache'))
33
+
34
+ batch_size = optimal_batch_sizes[preferred_device or '']
35
+ return batch_size, _get_model(model_name)
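For reference, a minimal sketch of calling `get_model` directly, using the same MiniLM model and batch-size table as `sbert.py` above:

from lilac.embeddings.transformer_utils import get_model

# The per-device batch sizes are empirical; '' is the default (CPU/CUDA) entry.
batch_size, model = get_model('all-MiniLM-L6-v2', {'': 64, 'mps': 256})
vectors = model.encode(['hello world'], batch_size=batch_size)
print(vectors.shape)  # (1, 384) for this model.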
lilac/embeddings/vector_store.py ADDED
@@ -0,0 +1,200 @@
1
+ """Interface for storing vectors."""
2
+
3
+ import abc
4
+ import os
5
+ import pickle
6
+ from typing import Iterable, Optional, Type
7
+
8
+ import numpy as np
9
+
10
+ from ..schema import SpanVector, VectorKey
11
+ from ..utils import open_file
12
+
13
+
14
+ class VectorStore(abc.ABC):
15
+ """Interface for storing and retrieving vectors."""
16
+
17
+ # The global name of the vector store.
18
+ name: str
19
+
20
+ @abc.abstractmethod
21
+ def save(self, base_path: str) -> None:
22
+ """Save the store to disk."""
23
+ pass
24
+
25
+ @abc.abstractmethod
26
+ def load(self, base_path: str) -> None:
27
+ """Load the store from disk."""
28
+ pass
29
+
30
+ @abc.abstractmethod
31
+ def size(self) -> int:
32
+ """Return the number of vectors in the store."""
33
+ pass
34
+
35
+ @abc.abstractmethod
36
+ def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
37
+ """Add or edit the given keyed embeddings to the store.
38
+
39
+ If the keys already exist they will be overwritten, acting as an "upsert".
40
+
41
+ Args:
42
+ keys: The keys to add the embeddings for.
43
+ embeddings: The embeddings to add. This should be a 2D matrix with the same length as keys.
44
+ """
45
+ pass
46
+
47
+ @abc.abstractmethod
48
+ def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
49
+ """Return the embeddings for given keys.
50
+
51
+ Args:
52
+ keys: The keys to return the embeddings for. If None, return all embeddings.
53
+
54
+ Returns:
55
+ The embeddings for the given keys.
56
+ """
57
+ pass
58
+
59
+ def topk(self,
60
+ query: np.ndarray,
61
+ k: int,
62
+ keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
63
+ """Return the top k most similar vectors.
64
+
65
+ Args:
66
+ query: The query vector.
67
+ k: The number of results to return.
68
+ keys: Optional keys to restrict the search to.
69
+
70
+ Returns:
71
+ A list of (key, score) tuples.
72
+ """
73
+ raise NotImplementedError
74
+
75
+
76
+ PathKey = VectorKey
77
+
78
+ _SPANS_PICKLE_NAME = 'spans.pkl'
79
+
80
+
81
+ class VectorDBIndex:
82
+ """Stores and retrieves span vectors.
83
+
84
+ This wraps a regular vector store by adding a mapping from path keys, such as (uuid1, 0),
85
+ to span keys, such as (uuid1, 0, 0), which denotes the first span in the (uuid1, 0) text document.
86
+ """
87
+
88
+ def __init__(self, vector_store: str) -> None:
89
+ self._vector_store: VectorStore = get_vector_store_cls(vector_store)()
90
+ # Map a path key to spans for that path.
91
+ self._id_to_spans: dict[PathKey, list[tuple[int, int]]] = {}
92
+
93
+ def load(self, base_path: str) -> None:
94
+ """Load the vector index from disk."""
95
+ assert not self._id_to_spans, 'Cannot load into a non-empty index.'
96
+ with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'rb') as f:
97
+ self._id_to_spans.update(pickle.load(f))
98
+ self._vector_store.load(os.path.join(base_path, self._vector_store.name))
99
+
100
+ def save(self, base_path: str) -> None:
101
+ """Save the vector index to disk."""
102
+ assert self._id_to_spans, 'Cannot save an empty index.'
103
+ with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'wb') as f:
104
+ pickle.dump(list(self._id_to_spans.items()), f)
105
+ self._vector_store.save(os.path.join(base_path, self._vector_store.name))
106
+
107
+ def add(self, spans: list[tuple[PathKey, list[tuple[int, int]]]], embeddings: np.ndarray) -> None:
108
+ """Add the given spans and embeddings.
109
+
110
+ Args:
111
+ spans: The spans to initialize the index with.
112
+ embeddings: The embeddings to initialize the index with.
113
+ """
114
+ assert not self._id_to_spans, 'Cannot add to a non-empty index.'
115
+ self._id_to_spans.update(spans)
116
+ vector_keys = [(*path_key, i) for path_key, spans in spans for i in range(len(spans))]
117
+ assert len(vector_keys) == len(embeddings), (
118
+ f'Number of spans ({len(vector_keys)}) and embeddings ({len(embeddings)}) must match.')
119
+ self._vector_store.add(vector_keys, embeddings)
120
+
121
+ def get_vector_store(self) -> VectorStore:
122
+ """Return the underlying vector store."""
123
+ return self._vector_store
124
+
125
+ def get(self, keys: Iterable[PathKey]) -> Iterable[list[SpanVector]]:
126
+ """Return the spans with vectors for each key in `keys`.
127
+
128
+ Args:
129
+ keys: The keys to return the vectors for.
130
+
131
+ Returns:
132
+ The span vectors for the given keys.
133
+ """
134
+ all_spans: list[list[tuple[int, int]]] = []
135
+ vector_keys: list[VectorKey] = []
136
+ for path_key in keys:
137
+ spans = self._id_to_spans[path_key]
138
+ all_spans.append(spans)
139
+ vector_keys.extend([(*path_key, i) for i in range(len(spans))])
140
+
141
+ all_vectors = self._vector_store.get(vector_keys)
142
+ offset = 0
143
+ for spans in all_spans:
144
+ vectors = all_vectors[offset:offset + len(spans)]
145
+ yield [{'span': span, 'vector': vector} for span, vector in zip(spans, vectors)]
146
+ offset += len(spans)
147
+
148
+ def topk(self,
149
+ query: np.ndarray,
150
+ k: int,
151
+ path_keys: Optional[Iterable[PathKey]] = None) -> list[tuple[PathKey, float]]:
152
+ """Return the top k most similar vectors.
153
+
154
+ Args:
155
+ query: The query vector.
156
+ k: The number of results to return.
157
+ path_keys: Optional key prefixes to restrict the search to.
158
+
159
+ Returns:
160
+ A list of (key, score) tuples.
161
+ """
162
+ span_keys: Optional[list[VectorKey]] = None
163
+ if path_keys is not None:
164
+ span_keys = [
165
+ (*path_key, i) for path_key in path_keys for i in range(len(self._id_to_spans[path_key]))
166
+ ]
167
+ span_k = k
168
+ path_key_scores: dict[PathKey, float] = {}
169
+ total_num_span_keys = self._vector_store.size()
170
+ while (len(path_key_scores) < k and span_k < total_num_span_keys and
171
+ (not span_keys or span_k < len(span_keys))):
172
+ span_k += k
173
+ vector_key_scores = self._vector_store.topk(query, span_k, span_keys)
174
+ for (*path_key_list, _), score in vector_key_scores:
175
+ path_key = tuple(path_key_list)
176
+ if path_key not in path_key_scores:
177
+ path_key_scores[path_key] = score
178
+
179
+ return list(path_key_scores.items())[:k]
180
+
181
+
182
+ VECTOR_STORE_REGISTRY: dict[str, Type[VectorStore]] = {}
183
+
184
+
185
+ def register_vector_store(vector_store_cls: Type[VectorStore]) -> None:
186
+ """Register a vector store in the global registry."""
187
+ if vector_store_cls.name in VECTOR_STORE_REGISTRY:
188
+ raise ValueError(f'Vector store "{vector_store_cls.name}" has already been registered!')
189
+
190
+ VECTOR_STORE_REGISTRY[vector_store_cls.name] = vector_store_cls
191
+
192
+
193
+ def get_vector_store_cls(vector_store_name: str) -> Type[VectorStore]:
194
+ """Return a registered vector store given the name in the registry."""
195
+ return VECTOR_STORE_REGISTRY[vector_store_name]
196
+
197
+
198
+ def clear_vector_store_registry() -> None:
199
+ """Clear the vector store registry."""
200
+ VECTOR_STORE_REGISTRY.clear()
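A small end-to-end sketch of `VectorDBIndex` with the numpy store (illustrative; the registration guard is only needed if the default stores were already registered at import time):

import numpy as np

from lilac.embeddings.default_vector_stores import register_default_vector_stores
from lilac.embeddings.vector_store import VectorDBIndex

try:
  register_default_vector_stores()
except ValueError:
  pass  # The default stores were already registered.

index = VectorDBIndex('numpy')
# One document (path key ('uuid1', 0)) with two text spans and one 2-d vector per span.
index.add([(('uuid1', 0), [(0, 5), (6, 11)])], np.eye(2, dtype=np.float32))

top = index.topk(np.array([1.0, 0.0], dtype=np.float32), k=1)
# -> [(('uuid1', 0), 1.0)]: span-level scores roll up to the path key.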
lilac/embeddings/vector_store_hnsw.py ADDED
@@ -0,0 +1,106 @@
1
+ """HNSW vector store."""
2
+
3
+ import multiprocessing
4
+ from typing import Iterable, Optional, Set, cast
5
+
6
+ import hnswlib
7
+ import numpy as np
8
+ import pandas as pd
9
+ from typing_extensions import override
10
+
11
+ from ..schema import VectorKey
12
+ from ..utils import DebugTimer
13
+ from .vector_store import VectorStore
14
+
15
+ _HNSW_SUFFIX = '.hnswlib.bin'
16
+ _LOOKUP_SUFFIX = '.lookup.pkl'
17
+
18
+
19
+ class HNSWVectorStore(VectorStore):
20
+ """HNSW-backed vector store."""
21
+
22
+ name = 'hnsw'
23
+
24
+ def __init__(self) -> None:
25
+ # Maps a `VectorKey` to a label in the hnswlib index.
26
+ self._key_to_label: Optional[pd.Series] = None
27
+ self._index: Optional[hnswlib.Index] = None
28
+
29
+ @override
30
+ def save(self, base_path: str) -> None:
31
+ assert self._key_to_label is not None and self._index is not None, (
32
+ 'The vector store has no embeddings. Call load() or add() first.')
33
+ self._index.save_index(base_path + _HNSW_SUFFIX)
34
+ self._key_to_label.to_pickle(base_path + _LOOKUP_SUFFIX)
35
+
36
+ @override
37
+ def load(self, base_path: str) -> None:
38
+ self._key_to_label = pd.read_pickle(base_path + _LOOKUP_SUFFIX)
39
+ dim = int(self._key_to_label.name)
40
+ index = hnswlib.Index(space='ip', dim=dim)
41
+ index.set_ef(10)
42
+ index.set_num_threads(multiprocessing.cpu_count())
43
+ index.load_index(base_path + _HNSW_SUFFIX)
44
+ self._index = index
45
+
46
+ @override
47
+ def size(self) -> int:
48
+ assert self._index is not None, (
49
+ 'The vector store has no embeddings. Call load() or add() first.')
50
+ return self._index.get_current_count()
51
+
52
+ @override
53
+ def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
54
+ assert self._index is None, (
55
+ 'Embeddings already exist in this store. Upsert is not yet supported.')
56
+
57
+ if len(keys) != embeddings.shape[0]:
58
+ raise ValueError(
59
+ f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')
60
+
61
+ dim = embeddings.shape[1]
62
+ with DebugTimer('hnswlib index creation'):
63
+ index = hnswlib.Index(space='ip', dim=dim)
64
+ index.set_ef(10)
65
+ index.set_num_threads(multiprocessing.cpu_count())
66
+ index.init_index(max_elements=len(keys), ef_construction=50, M=16)
67
+
68
+ # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x faster
69
+ # than float64.
70
+ embeddings = embeddings.astype(np.float32)
71
+ row_indices = np.arange(len(keys), dtype=np.int32)
72
+ self._key_to_label = pd.Series(row_indices, index=keys, dtype=np.int32)
73
+ self._key_to_label.name = str(dim)
74
+ index.add_items(embeddings, row_indices)
75
+ self._index = index
76
+
77
+ @override
78
+ def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
79
+ assert self._index is not None and self._key_to_label is not None, (
80
+ 'No embeddings exist in this store.')
81
+ if not keys:
82
+ return np.array(self._index.get_items(self._key_to_label.values), dtype=np.float32)
83
+ locs = self._key_to_label.loc[cast(list[str], keys)].values
84
+ return np.array(self._index.get_items(locs), dtype=np.float32)
85
+
86
+ @override
87
+ def topk(self,
88
+ query: np.ndarray,
89
+ k: int,
90
+ keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
91
+ assert self._index is not None and self._key_to_label is not None, (
92
+ 'No embeddings exist in this store.')
93
+ labels: Set[int] = set()
94
+ if keys is not None:
95
+ labels = set(self._key_to_label.loc[cast(list[str], keys)].tolist())
96
+ k = min(k, len(labels))
97
+
98
+ def filter_func(label: int) -> bool:
99
+ return label in labels
100
+
101
+ query = np.expand_dims(query.astype(np.float32), axis=0)
102
+ locs, dists = self._index.knn_query(query, k=k, filter=filter_func if labels else None)
103
+ locs = locs[0]
104
+ dists = dists[0]
105
+ topk_keys = self._key_to_label.index.values[locs]
106
+ return [(key, 1 - dist) for key, dist in zip(topk_keys, dists)]
lilac/embeddings/vector_store_numpy.py ADDED
@@ -0,0 +1,92 @@
1
+ """NumpyVectorStore class for storing vectors in numpy arrays."""
2
+
3
+ from typing import Iterable, Optional, cast
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing_extensions import override
8
+
9
+ from ..schema import VectorKey
10
+ from .vector_store import VectorStore
11
+
12
+ _EMBEDDINGS_SUFFIX = '.matrix.npy'
13
+ _LOOKUP_SUFFIX = '.lookup.pkl'
14
+
15
+
16
+ class NumpyVectorStore(VectorStore):
17
+ """Stores vectors as in-memory np arrays."""
18
+ name = 'numpy'
19
+
20
+ def __init__(self) -> None:
21
+ self._embeddings: Optional[np.ndarray] = None
22
+ # Maps a `VectorKey` to a row index in `_embeddings`.
23
+ self._key_to_index: Optional[pd.Series] = None
24
+
25
+ @override
26
+ def size(self) -> int:
27
+ assert self._embeddings is not None, (
28
+ 'The vector store has no embeddings. Call load() or add() first.')
29
+ return len(self._embeddings)
30
+
31
+ @override
32
+ def save(self, base_path: str) -> None:
33
+ assert self._embeddings is not None and self._key_to_index is not None, (
34
+ 'The vector store has no embeddings. Call load() or add() first.')
35
+ np.save(base_path + _EMBEDDINGS_SUFFIX, self._embeddings, allow_pickle=False)
36
+ self._key_to_index.to_pickle(base_path + _LOOKUP_SUFFIX)
37
+
38
+ @override
39
+ def load(self, base_path: str) -> None:
40
+ self._embeddings = np.load(base_path + _EMBEDDINGS_SUFFIX, allow_pickle=False)
41
+ self._key_to_index = pd.read_pickle(base_path + _LOOKUP_SUFFIX)
42
+
43
+ @override
44
+ def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
45
+ if self._embeddings or self._key_to_index:
46
+ raise ValueError('Embeddings already exist in this store. Upsert is not yet supported.')
47
+
48
+ if len(keys) != embeddings.shape[0]:
49
+ raise ValueError(
50
+ f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')
51
+
52
+ # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x faster
53
+ # than float64.
54
+ self._embeddings = embeddings.astype(np.float32)
55
+ row_indices = np.arange(len(embeddings), dtype=np.uint32)
56
+ self._key_to_index = pd.Series(row_indices, index=keys, dtype=np.uint32)
57
+
58
+ @override
59
+ def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
60
+ assert self._embeddings is not None and self._key_to_index is not None, (
61
+ 'The vector store has no embeddings. Call load() or add() first.')
62
+ if not keys:
63
+ return self._embeddings
64
+ locs = self._key_to_index.loc[cast(list[str], keys)]
65
+ return self._embeddings.take(locs, axis=0)
66
+
67
+ @override
68
+ def topk(self,
69
+ query: np.ndarray,
70
+ k: int,
71
+ keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
72
+ assert self._embeddings is not None and self._key_to_index is not None, (
73
+ 'The vector store has no embeddings. Call load() or add() first.')
74
+ if keys is not None:
75
+ row_indices = self._key_to_index.loc[cast(list[str], keys)]
76
+ embeddings = self._embeddings.take(row_indices, axis=0)
77
+ keys = list(keys)
78
+ else:
79
+ keys, embeddings = cast(list[VectorKey], self._key_to_index.index.tolist()), self._embeddings
80
+
81
+ query = query.astype(embeddings.dtype)
82
+ similarities: np.ndarray = np.dot(embeddings, query).reshape(-1)
83
+ k = min(k, len(similarities))
84
+
85
+ # We do a partition + sort only top K to save time: O(n + klogk) instead of O(nlogn).
86
+ indices = np.argpartition(similarities, -k)[-k:]
87
+ # Indices sorted by value from largest to smallest.
88
+ indices = indices[np.argsort(similarities[indices])][::-1]
89
+
90
+ topk_similarities = similarities[indices]
91
+ topk_keys = [keys[idx] for idx in indices]
92
+ return list(zip(topk_keys, topk_similarities))
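And the same idea with `NumpyVectorStore` used directly, to make the dot-product ranking concrete (illustrative):

import numpy as np

from lilac.embeddings.vector_store_numpy import NumpyVectorStore

store = NumpyVectorStore()
store.add([('doc1', 0, 0), ('doc2', 0, 0)], np.eye(2, dtype=np.float32))

(key, score), = store.topk(np.array([0.0, 1.0], dtype=np.float32), k=1)
assert key == ('doc2', 0, 0) and score == 1.0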
lilac/env.py ADDED
@@ -0,0 +1,63 @@
1
+ """Load environment variables from .env file."""
2
+ import os
3
+ from typing import Any, Literal, Optional, Union, cast
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ EnvironmentKeys = Union[Literal['LILAC_DATA_PATH'],
8
+ # Authentication on the demo.
9
+ Literal['LILAC_AUTH_ENABLED'], Literal['GOOGLE_CLIENT_ID'],
10
+ Literal['GOOGLE_CLIENT_SECRET'], Literal['LILAC_OAUTH_SECRET_KEY'],
11
+ # DuckDB accessing GCS.
12
+ Literal['GCS_REGION'], Literal['GCS_ACCESS_KEY'], Literal['GCS_SECRET_KEY'],
13
+ # Embedding API keys.
14
+ Literal['OPENAI_API_KEY'], Literal['COHERE_API_KEY'],
15
+ Literal['PALM_API_KEY'],
16
+ # HuggingFace demos.
17
+ Literal['HF_USERNAME'], Literal['HF_STAGING_DEMO_REPO'],
18
+ Literal['SPACE_ID'], Literal['HF_ACCESS_TOKEN'],
19
+ # DuckDB
20
+ Literal['DUCKDB_USE_VIEWS'],
21
+ # Debugging
22
+ Literal['DEBUG'], Literal['DISABLE_LOGS']]
23
+
24
+
25
+ def _init_env() -> None:
26
+ in_test = os.environ.get('LILAC_TEST', None)
27
+ # Load the .env files into the environment in order of highest to lowest priority.
28
+
29
+ if not in_test: # Skip local environment variables when testing.
30
+ load_dotenv('.env.local')
31
+ load_dotenv('.env.demo')
32
+ load_dotenv('.env')
33
+
34
+ if os.environ.get('LILAC_AUTH_ENABLED', None):
35
+ if not os.environ.get('GOOGLE_CLIENT_ID', None) or not os.environ.get(
36
+ 'GOOGLE_CLIENT_SECRET', None):
37
+ raise ValueError(
38
+ 'Missing `GOOGLE_CLIENT_ID` or `GOOGLE_CLIENT_SECRET` when `LILAC_AUTH_ENABLED=true`')
39
+ SECRET_KEY = os.environ.get('LILAC_OAUTH_SECRET_KEY', None)
40
+ if not SECRET_KEY:
41
+ raise ValueError('Missing `LILAC_OAUTH_SECRET_KEY` when `LILAC_AUTH_ENABLED=true`')
42
50
+
51
+
52
+ def env(key: EnvironmentKeys, default: Optional[Any] = None) -> Any:
53
+ """Return the value of an environment variable."""
54
+ return os.environ.get(key, default)
55
+
56
+
57
+ def data_path() -> str:
58
+ """Return the base path for data."""
59
+ return cast(str, env('LILAC_DATA_PATH', './data'))
60
+
61
+
62
+ # Initialize the environment at import time.
63
+ _init_env()
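Downstream modules are expected to read configuration through these helpers rather than `os.environ` directly, e.g.:

from lilac.env import data_path, env

if env('DEBUG', False):
  print('Debug logging is enabled.')
print('Datasets and caches live under', data_path())  # Defaults to ./data.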
lilac/load.py ADDED
@@ -0,0 +1,214 @@
1
+ """A script to load a dataset or set of datasets from a config for a Lilac instance.
2
+
3
+ Usage:
4
+
5
+ poetry run python -m lilac.load \
6
+ --output_dir=demo_data \
7
+ --config_path=demo.yml
8
+ """
9
+
10
+ import gc
11
+ import json
12
+ import os
13
+ import pathlib
14
+ import shutil
15
+
16
+ import click
17
+ import dask
18
+ import psutil
19
+ import yaml
20
+ from distributed import Client
21
+
22
+ from .config import Config, EmbeddingConfig, SignalConfig
23
+ from .data_loader import process_source
24
+ from .db_manager import get_dataset
25
+ from .schema import UUID_COLUMN
26
+ from .tasks import TaskManager, TaskStepId
27
+ from .utils import DebugTimer, get_datasets_dir, list_datasets
28
+
29
+
30
+ @click.command()
31
+ @click.option(
32
+ '--output_dir', required=True, type=str, help='The output directory to write files to.')
33
+ @click.option(
34
+ '--config_path',
35
+ required=True,
36
+ type=str,
37
+ help='The path to a json or yml file describing the configuration. '
38
+ 'The file contents should be an instance of `lilac.Config`.')
39
+ @click.option(
40
+ '--overwrite',
41
+ help='When True, runs all data from scratch, overwriting existing data. When false, only '
42
+ 'loads new datasets, embeddings, and signals.',
43
+ type=bool,
44
+ is_flag=True,
45
+ default=False)
46
+ def load_command(output_dir: str, config_path: str, overwrite: bool) -> None:
47
+ """Run the source loader as a binary."""
48
+ load(output_dir, config_path, overwrite)
49
+
50
+
51
+ def load(output_dir: str, config_path: str, overwrite: bool) -> None:
52
+ """Run the source loader as a binary."""
53
+ old_data_path = os.environ.get('LILAC_DATA_PATH')
54
+ os.environ['LILAC_DATA_PATH'] = output_dir
55
+ # Turn off debug logging.
56
+ del os.environ['DEBUG']
57
+
58
+ config_ext = pathlib.Path(config_path).suffix
59
+ if config_ext in ['.yml', '.yaml']:
60
+ with open(config_path, 'r') as f:
61
+ config_dict = yaml.safe_load(f)
62
+ elif config_ext in ['.json']:
63
+ with open(config_path, 'r') as f:
64
+ config_dict = json.load(f)
65
+ else:
66
+ raise ValueError(f'Unsupported config file extension: {config_ext}')
67
+
68
+ config = Config(**config_dict)
69
+
70
+ # Explicitly create a dask client in sync mode.
71
+ dask.config.set({'distributed.worker.daemon': False})
72
+ total_memory_gb = psutil.virtual_memory().total / (1024**3)
73
+ task_manager = TaskManager(Client(memory_limit=f'{total_memory_gb} GB'))
74
+
75
+ if overwrite:
76
+ shutil.rmtree(get_datasets_dir(output_dir), ignore_errors=True)
77
+
78
+ existing_datasets = [f'{d.namespace}/{d.dataset_name}' for d in list_datasets(output_dir)]
79
+
80
+ print()
81
+ print('*** Load datasets ***')
82
+ if overwrite:
83
+ datasets_to_load = config.datasets
84
+ else:
85
+ datasets_to_load = [
86
+ d for d in config.datasets if f'{d.namespace}/{d.name}' not in existing_datasets
87
+ ]
88
+ skipped_datasets = [
89
+ d for d in config.datasets if f'{d.namespace}/{d.name}' in existing_datasets
90
+ ]
91
+ print('Skipping loaded datasets:', ', '.join([d.name for d in skipped_datasets]))
92
+
93
+ with DebugTimer(f'Loading datasets: {", ".join([d.name for d in datasets_to_load])}'):
94
+ for d in datasets_to_load:
95
+ shutil.rmtree(os.path.join(output_dir, d.name), ignore_errors=True)
96
+ task_id = task_manager.task_id(f'Load dataset {d.namespace}/{d.name}')
97
+ task_manager.execute(task_id, process_source, output_dir, d.namespace, d.name, d.source,
98
+ (task_id, 0))
99
+
100
+ task_manager.wait()
101
+
102
+ print()
103
+ total_num_rows = 0
104
+ for d in datasets_to_load:
105
+ num_rows = get_dataset(d.namespace, d.name).select_rows([UUID_COLUMN], limit=1).total_num_rows
106
+ print(f'{d.namespace}/{d.name} loaded with {num_rows:,} rows.')
107
+ gc.collect()
108
+ total_num_rows += num_rows
109
+
110
+ print(f'Done loading {len(datasets_to_load)} datasets with {total_num_rows:,} rows.')
111
+
112
+ print('*** Dataset settings ***')
113
+ for d in config.datasets:
114
+ if d.settings:
115
+ dataset = get_dataset(d.namespace, d.name)
116
+ dataset.update_settings(d.settings)
117
+
118
+ print()
119
+ print('*** Compute embeddings ***')
120
+ with DebugTimer('Loading embeddings'):
121
+ for d in config.datasets:
122
+ # If embeddings are explicitly set, use only those.
123
+ embeddings = d.embeddings or []
124
+ # If embeddings are not explicitly set, use the media paths and preferred embedding from
125
+ # settings.
126
+ if not embeddings:
127
+ if d.settings and d.settings.ui:
128
+ for path in d.settings.ui.media_paths or []:
129
+ if d.settings.preferred_embedding:
130
+ embeddings.append(
131
+ EmbeddingConfig(path=path, embedding=d.settings.preferred_embedding))
132
+ print('emb configs', embeddings)
133
+ for e in embeddings:
134
+ task_id = task_manager.task_id(f'Compute embedding {e.embedding} on {e.path}')
135
+ task_manager.execute(task_id, _compute_embedding, d.namespace, d.name, e, output_dir,
136
+ overwrite, (task_id, 0))
137
+ task_manager.wait()
138
139
+ print()
140
+ print('*** Compute signals ***')
141
+ with DebugTimer('Computing signals'):
142
+ for d in config.datasets:
143
+ # If signals are explicitly set, use only those.
144
+ signals = d.signals or []
145
+ # If signals are not explicitly set, use the media paths and config.signals.
146
+ if not signals:
147
+ if d.settings and d.settings.ui:
148
+ for path in d.settings.ui.media_paths or []:
149
+ for signal in config.signals or []:
150
+ signals.append(SignalConfig(path=path, signal=signal))
151
+
152
+ for s in signals:
153
+ task_id = task_manager.task_id(f'Compute signal {s.signal} on {s.path}')
154
+ task_manager.execute(task_id, _compute_signal, d.namespace, d.name, s, output_dir,
155
+ overwrite, (task_id, 0))
156
+ task_manager.wait()
157
+
158
+ print()
159
+ print('Done!')
160
+
161
+ if old_data_path:
162
+ os.environ['LILAC_DATA_PATH'] = old_data_path
163
+
164
+
165
+ def _compute_signal(namespace: str, name: str, signal_config: SignalConfig, output_dir: str,
166
+ overwrite: bool, task_step_id: TaskStepId) -> None:
167
+ os.environ['LILAC_DATA_PATH'] = output_dir
168
+ # Turn off debug logging.
169
+ if 'DEBUG' in os.environ:
170
+ del os.environ['DEBUG']
171
+
172
+ compute_signal = False
173
+ if overwrite:
174
+ compute_signal = True
175
+
176
+ dataset = get_dataset(namespace, name)
177
+
178
+ if not compute_signal:
179
+ field = dataset.manifest().data_schema.get_field(signal_config.path)
180
+ signal_field = (field.fields or {}).get(signal_config.signal.key())
181
+ if not signal_field or signal_field.signal != signal_config.signal.dict():
182
+ compute_signal = True
183
+ if compute_signal:
184
+ dataset.compute_signal(signal_config.signal, signal_config.path, task_step_id)
185
+
186
+ gc.collect()
187
+
188
+
189
+ def _compute_embedding(namespace: str, name: str, embedding_config: EmbeddingConfig,
190
+ output_dir: str, overwrite: bool, task_step_id: TaskStepId) -> None:
191
+ os.environ['LILAC_DATA_PATH'] = output_dir
192
+ # Turn off debug logging.
193
+ if 'DEBUG' in os.environ:
194
+ del os.environ['DEBUG']
195
+
196
+ compute_embedding = False
197
+ if overwrite:
198
+ compute_embedding = True
199
+
200
+ dataset = get_dataset(namespace, name)
201
+ if not compute_embedding:
202
+ field = dataset.manifest().data_schema.get_field(embedding_config.path)
203
+ embedding_field = (field.fields or {}).get(embedding_config.embedding)
204
+ if not embedding_field:
205
+ compute_embedding = True
206
+
207
+ if compute_embedding:
208
+ dataset.compute_embedding(embedding_config.embedding, embedding_config.path, task_step_id)
209
+
210
+ gc.collect()
211
+
212
+
213
+ if __name__ == '__main__':
214
+ load_command()
lilac/make_openapi.py ADDED
@@ -0,0 +1,29 @@
1
+ """Writes the openapi.json file to the specified output.
2
+
3
+ This is meant to run as a standalone script. It lives in lilac/ so we can import the FastAPI app.
4
+ """
5
+ import json
6
+
7
+ import click
8
+ from fastapi.openapi.utils import get_openapi
9
+
10
+ from .server import app
11
+
12
+
13
+ @click.command()
14
+ @click.option(
15
+ '--output', required=True, type=str, help='The output filepath for the openapi.json file.')
16
+ def main(output: str) -> None:
17
+ """Create the openapi.json file for the API to generate TypeScript stubs."""
18
+ with open(output, 'w') as f:
19
+ json.dump(
20
+ get_openapi(
21
+ title=app.title,
22
+ version=app.version,
23
+ openapi_version=app.openapi_version,
24
+ description=app.description,
25
+ routes=app.routes), f)
26
+
27
+
28
+ if __name__ == '__main__':
29
+ main()
lilac/parquet_writer.py ADDED
@@ -0,0 +1,70 @@
1
+ """A Parquet file writer that wraps the pyarrow writer."""
2
+ from typing import IO, Optional
3
+
4
+ import pyarrow as pa
5
+ import pyarrow.parquet as pq
6
+
7
+ from .schema import Item, Schema, schema_to_arrow_schema
8
+
9
+
10
+ class ParquetWriter:
11
+ """A writer to parquet."""
12
+
13
+ def __init__(self,
14
+ schema: Schema,
15
+ codec: str = 'snappy',
16
+ row_group_buffer_size: int = 128 * 1024 * 1024,
17
+ record_batch_size: int = 10_000):
18
+ self._schema = schema_to_arrow_schema(schema)
19
+ self._codec = codec
20
+ self._row_group_buffer_size = row_group_buffer_size
21
+ self._buffer: list[list[Optional[Item]]] = [[] for _ in range(len(self._schema.names))]
22
+ self._buffer_size = record_batch_size
23
+ self._record_batches: list[pa.RecordBatch] = []
24
+ self._record_batches_byte_size = 0
25
+ self.writer: Optional[pq.ParquetWriter] = None
26
+
27
+ def open(self, file_handle: IO) -> None:
28
+ """Open the destination file for writing."""
29
+ self.writer = pq.ParquetWriter(file_handle, self._schema, compression=self._codec)
30
+
31
+ def write(self, record: Item) -> None:
32
+ """Write the record to the destination file."""
33
+ if len(self._buffer[0]) >= self._buffer_size:
34
+ self._flush_buffer()
35
+
36
+ if self._record_batches_byte_size >= self._row_group_buffer_size:
37
+ self._write_batches()
38
+
39
+ # reorder the data in columnar format.
40
+ for i, n in enumerate(self._schema.names):
41
+ self._buffer[i].append(record.get(n))
42
+
43
+ def close(self) -> None:
44
+ """Flushes the write buffer and closes the destination file."""
45
+ if len(self._buffer[0]) > 0:
46
+ self._flush_buffer()
47
+ if self._record_batches_byte_size > 0:
48
+ self._write_batches()
49
+
50
+ self.writer.close()
51
+
52
+ def _write_batches(self) -> None:
53
+ table = pa.Table.from_batches(self._record_batches, schema=self._schema)
54
+ self._record_batches = []
55
+ self._record_batches_byte_size = 0
56
+ self.writer.write_table(table)
57
+
58
+ def _flush_buffer(self) -> None:
59
+ arrays: list[pa.Array] = [[] for _ in range(len(self._schema.names))]
60
+ for x, y in enumerate(self._buffer):
61
+ arrays[x] = pa.array(y, type=self._schema.types[x])
62
+ self._buffer[x] = []
63
+ rb = pa.RecordBatch.from_arrays(arrays, schema=self._schema)
64
+ self._record_batches.append(rb)
65
+ size = 0
66
+ for x in arrays:
67
+ for b in x.buffers(): # type: ignore
68
+ if b is not None:
69
+ size = size + b.size
70
+ self._record_batches_byte_size = self._record_batches_byte_size + size
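A minimal usage sketch of the writer above (the field names and file path are hypothetical, not taken from this diff): construct it from a lilac Schema, open a binary file handle, write row dictionaries, and close to flush the remaining buffers.

from lilac.parquet_writer import ParquetWriter
from lilac.schema import schema

# Hypothetical two-column schema; ParquetWriter converts it to an arrow schema internally.
writer = ParquetWriter(schema({'id': 'string', 'text': 'string'}))
with open('items.parquet', 'wb') as f:
  writer.open(f)
  writer.write({'id': '1', 'text': 'hello'})
  writer.write({'id': '2', 'text': 'world'})
  writer.close()  # Flushes buffered record batches and writes the parquet footer.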
lilac/router_concept.py ADDED
@@ -0,0 +1,209 @@
1
+ """Router for the concept database."""
2
+
3
+ from typing import Annotated, Iterable, Optional, cast
4
+
5
+ from fastapi import APIRouter, HTTPException
6
+ from fastapi.params import Depends
7
+ from openai_function_call import OpenAISchema
8
+ from pydantic import BaseModel, Field
9
+
10
+ from .auth import UserInfo, get_session_user
11
+ from .concepts.concept import DRAFT_MAIN, Concept, ConceptMetrics, DraftId, draft_examples
12
+ from .concepts.db_concept import DISK_CONCEPT_DB, DISK_CONCEPT_MODEL_DB, ConceptInfo, ConceptUpdate
13
+ from .env import env
14
+ from .router_utils import RouteErrorHandler, server_compute_concept
15
+ from .schema import RichData, SignalInputType
16
+ from .signals.concept_scorer import ConceptScoreSignal
17
+
18
+ router = APIRouter(route_class=RouteErrorHandler)
19
+
20
+
21
+ @router.get('/', response_model_exclude_none=True)
22
+ def get_concepts(
23
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> list[ConceptInfo]:
24
+ """List the concepts."""
25
+ return DISK_CONCEPT_DB.list(user)
26
+
27
+
28
+ @router.get('/{namespace}/{concept_name}', response_model_exclude_none=True)
29
+ def get_concept(namespace: str,
30
+ concept_name: str,
31
+ draft: Optional[DraftId] = DRAFT_MAIN,
32
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)] = None) -> Concept:
33
+ """Get a concept from a database."""
34
+ concept = DISK_CONCEPT_DB.get(namespace, concept_name, user)
35
+ if not concept:
36
+ raise HTTPException(
37
+ status_code=404,
38
+ detail=f'Concept "{namespace}/{concept_name}" was not found or user does not have access.')
39
+
40
+ # Only return the examples from the draft.
41
+ concept.data = draft_examples(concept, draft or DRAFT_MAIN)
42
+
43
+ return concept
44
+
45
+
46
+ class CreateConceptOptions(BaseModel):
47
+ """Options for creating a concept."""
48
+ # Namespace of the concept.
49
+ namespace: str
50
+ # Name of the concept.
51
+ name: str
52
+ # Input type (modality) of the concept.
53
+ type: SignalInputType
54
+ description: Optional[str] = None
55
+
56
+
57
+ @router.post('/create', response_model_exclude_none=True)
58
+ def create_concept(options: CreateConceptOptions,
59
+ user: Annotated[Optional[UserInfo],
60
+ Depends(get_session_user)]) -> Concept:
61
+ """Create a concept in the database."""
62
+ return DISK_CONCEPT_DB.create(options.namespace, options.name, options.type, options.description,
63
+ user)
64
+
65
+
66
+ @router.post('/{namespace}/{concept_name}', response_model_exclude_none=True)
67
+ def edit_concept(namespace: str, concept_name: str, change: ConceptUpdate,
68
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> Concept:
69
+ """Edit a concept in the database."""
70
+ return DISK_CONCEPT_DB.edit(namespace, concept_name, change, user)
71
+
72
+
73
+ @router.delete('/{namespace}/{concept_name}')
74
+ def delete_concept(namespace: str, concept_name: str,
75
+ user: Annotated[Optional[UserInfo],
76
+ Depends(get_session_user)]) -> None:
77
+ """Deletes the concept from the database."""
78
+ DISK_CONCEPT_DB.remove(namespace, concept_name, user)
79
+
80
+
81
+ class MergeConceptDraftOptions(BaseModel):
82
+ """Merge a draft into main."""
83
+ draft: DraftId
84
+
85
+
86
+ @router.post('/{namespace}/{concept_name}/merge_draft', response_model_exclude_none=True)
87
+ def merge_concept_draft(namespace: str, concept_name: str, options: MergeConceptDraftOptions,
88
+ user: Annotated[Optional[UserInfo],
89
+ Depends(get_session_user)]) -> Concept:
90
+ """Merge a draft in the concept into main."""
91
+ return DISK_CONCEPT_DB.merge_draft(namespace, concept_name, options.draft, user)
92
+
93
+
94
+ class ScoreExample(BaseModel):
95
+ """Example to score along a specific concept."""
96
+ text: Optional[str] = None
97
+ img: Optional[bytes] = None
98
+
99
+
100
+ class ScoreBody(BaseModel):
101
+ """Request body for the score endpoint."""
102
+ examples: list[ScoreExample]
103
+ draft: str = DRAFT_MAIN
104
+
105
+
106
+ class ConceptModelInfo(BaseModel):
107
+ """Information about a concept model."""
108
+ namespace: str
109
+ concept_name: str
110
+ embedding_name: str
111
+ version: int
112
+ metrics: Optional[ConceptMetrics] = None
113
+
114
+
115
+ @router.get('/{namespace}/{concept_name}/model')
116
+ def get_concept_models(
117
+ namespace: str,
118
+ concept_name: str,
119
+ user: Annotated[Optional[UserInfo],
120
+ Depends(get_session_user)] = None) -> list[ConceptModelInfo]:
121
+ """Get a concept model from a database."""
122
+ concept = DISK_CONCEPT_DB.get(namespace, concept_name, user)
123
+ if not concept:
124
+ raise HTTPException(
125
+ status_code=404, detail=f'Concept "{namespace}/{concept_name}" was not found')
126
+ models = DISK_CONCEPT_MODEL_DB.get_models(namespace, concept_name, user)
127
+
128
+ for m in models:
129
+ DISK_CONCEPT_MODEL_DB.sync(m, user)
130
+
131
+ return [
132
+ ConceptModelInfo(
133
+ namespace=m.namespace,
134
+ concept_name=m.concept_name,
135
+ embedding_name=m.embedding_name,
136
+ version=m.version,
137
+ metrics=m.get_metrics(concept)) for m in models
138
+ ]
139
+
140
+
141
+ @router.get('/{namespace}/{concept_name}/model/{embedding_name}')
142
+ def get_concept_model(
143
+ namespace: str,
144
+ concept_name: str,
145
+ embedding_name: str,
146
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)] = None) -> ConceptModelInfo:
147
+ """Get a concept model from a database."""
148
+ concept = DISK_CONCEPT_DB.get(namespace, concept_name, user)
149
+ if not concept:
150
+ raise HTTPException(
151
+ status_code=404, detail=f'Concept "{namespace}/{concept_name}" was not found')
152
+
153
+ model = DISK_CONCEPT_MODEL_DB.get(namespace, concept_name, embedding_name, user=user)
154
+ if not model:
155
+ model = DISK_CONCEPT_MODEL_DB.create(namespace, concept_name, embedding_name, user=user)
156
+ DISK_CONCEPT_MODEL_DB.sync(model)
157
+ model_info = ConceptModelInfo(
158
+ namespace=model.namespace,
159
+ concept_name=model.concept_name,
160
+ embedding_name=model.embedding_name,
161
+ version=model.version,
162
+ metrics=model.get_metrics(concept))
163
+ return model_info
164
+
165
+
166
+ @router.post(
167
+ '/{namespace}/{concept_name}/model/{embedding_name}/score', response_model_exclude_none=True)
168
+ def score(namespace: str, concept_name: str, embedding_name: str, body: ScoreBody,
169
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> list[list[dict]]:
170
+ """Score examples along the specified concept."""
171
+ concept_scorer = ConceptScoreSignal(
172
+ namespace=namespace, concept_name=concept_name, embedding=embedding_name)
173
+ return cast(
174
+ list[list[dict]],
175
+ server_compute_concept(concept_scorer, cast(Iterable[RichData],
176
+ [e.text for e in body.examples]), user))
177
+
178
+
179
+ class Examples(OpenAISchema):
180
+ """Generated text examples."""
181
+ examples: list[str] = Field(..., description='List of generated examples')
182
+
183
+
184
+ @router.get('/generate_examples')
185
+ def generate_examples(description: str) -> list[str]:
186
+ """Generate positive examples for a given concept using an LLM model."""
187
+ try:
188
+ import openai
189
+ except ImportError:
190
+ raise ImportError('Could not import the "openai" python package. '
191
+ 'Please install it with `pip install openai`.')
192
+
193
+ openai.api_key = env('OPENAI_API_KEY')
194
+ completion = openai.ChatCompletion.create(
195
+ model='gpt-3.5-turbo-0613',
196
+ functions=[Examples.openai_schema],
197
+ messages=[
198
+ {
199
+ 'role': 'system',
200
+ 'content': 'You must call the `Examples` function with the generated examples',
201
+ },
202
+ {
203
+ 'role': 'user',
204
+ 'content': f'Write 5 diverse, unnumbered, and concise examples of "{description}"',
205
+ },
206
+ ],
207
+ )
208
+ result = Examples.from_response(completion)
209
+ return result.examples
lilac/router_data_loader.py ADDED
@@ -0,0 +1,80 @@
1
+ """The source loader runner which loads data into parquet files for the app.
2
+
3
+ To run the source loader as a binary directly:
4
+
5
+ poetry run python -m lilac.datasets.loader \
6
+ --dataset_name=$DATASET \
7
+ --output_dir=./data/ \
8
+ --config_path=./datasets/the_movies_dataset.json
9
+ """
10
+ from typing import Any
11
+
12
+ from fastapi import APIRouter, HTTPException, Request
13
+ from pydantic import BaseModel
14
+
15
+ from .auth import get_user_access
16
+ from .data_loader import process_source
17
+ from .env import data_path
18
+ from .router_utils import RouteErrorHandler
19
+ from .sources.source_registry import get_source_cls, registered_sources
20
+ from .tasks import TaskId, task_manager
21
+
22
+ REQUEST_TIMEOUT_SEC = 30 * 60 # 30 mins.
23
+
24
+ router = APIRouter(route_class=RouteErrorHandler)
25
+
26
+
27
+ class ProcessSourceRequest(BaseModel):
28
+ """The interface to the /process_source endpoint."""
29
+ username: str
30
+ dataset_name: str
31
+
32
+
33
+ class SourcesList(BaseModel):
34
+ """The interface to the /process_source endpoint."""
35
+ sources: list[str]
36
+
37
+
38
+ @router.get('/')
39
+ def get_sources() -> SourcesList:
40
+ """Get the list of available sources."""
41
+ sources = registered_sources()
42
+ return SourcesList(sources=list(sources.keys()))
43
+
44
+
45
+ @router.get('/{source_name}')
46
+ def get_source_schema(source_name: str) -> dict[str, Any]:
47
+ """Get the fields for a source."""
48
+ source_cls = get_source_cls(source_name)
49
+ return source_cls.schema()
50
+
51
+
52
+ class LoadDatasetOptions(BaseModel):
53
+ """Options for loading a dataset."""
54
+ namespace: str
55
+ dataset_name: str
56
+ config: dict[str, Any]
57
+
58
+
59
+ class LoadDatasetResponse(BaseModel):
60
+ """Response of the load dataset endpoint."""
61
+ task_id: TaskId
62
+
63
+
64
+ @router.post('/{source_name}/load')
65
+ async def load(source_name: str, options: LoadDatasetOptions,
66
+ request: Request) -> LoadDatasetResponse:
67
+ """Load a dataset."""
68
+ if not get_user_access().create_dataset:
69
+ raise HTTPException(401, 'User does not have access to load a dataset.')
70
+
71
+ source_cls = get_source_cls(source_name)
72
+ source = source_cls(**options.config)
73
+
74
+ task_id = task_manager().task_id(
75
+ name=f'[{options.namespace}/{options.dataset_name}] Load dataset',
76
+ description=f'Loader: {source.name}. \n Config: {source}')
77
+ task_manager().execute(task_id, process_source, data_path(), options.namespace,
78
+ options.dataset_name, source, (task_id, 0))
79
+
80
+ return LoadDatasetResponse(task_id=task_id)
lilac/router_dataset.py ADDED
@@ -0,0 +1,303 @@
1
+ """Router for the dataset database."""
2
+ from typing import Annotated, Optional, Sequence, Union, cast
3
+ from urllib.parse import unquote
4
+
5
+ from fastapi import APIRouter, HTTPException, Response
6
+ from fastapi.params import Depends
7
+ from fastapi.responses import ORJSONResponse
8
+ from pydantic import BaseModel, validator
9
+
10
+ from .auth import UserInfo, get_session_user, get_user_access
11
+ from .data.dataset import BinaryOp
12
+ from .data.dataset import Column as DBColumn
13
+ from .data.dataset import DatasetManifest, DatasetSettings, FeatureListValue, FeatureValue
14
+ from .data.dataset import Filter as PyFilter
15
+ from .data.dataset import (
16
+ GroupsSortBy,
17
+ ListOp,
18
+ Search,
19
+ SelectGroupsResult,
20
+ SelectRowsSchemaResult,
21
+ SortOrder,
22
+ StatsResult,
23
+ UnaryOp,
24
+ )
25
+ from .db_manager import get_dataset, remove_dataset_from_cache
26
+ from .env import data_path
27
+ from .router_utils import RouteErrorHandler
28
+ from .schema import Bin, Path, normalize_path
29
+ from .signals.concept_labels import ConceptLabelsSignal
30
+ from .signals.concept_scorer import ConceptScoreSignal
31
+ from .signals.semantic_similarity import SemanticSimilaritySignal
32
+ from .signals.signal import Signal, TextEmbeddingSignal, TextSignal, resolve_signal
33
+ from .signals.substring_search import SubstringSignal
34
+ from .tasks import TaskId, task_manager
35
+ from .utils import DatasetInfo, list_datasets
36
+
37
+ router = APIRouter(route_class=RouteErrorHandler)
38
+
39
+
40
+ @router.get('/', response_model_exclude_none=True)
41
+ def get_datasets() -> list[DatasetInfo]:
42
+ """List the datasets."""
43
+ return list_datasets(data_path())
44
+
45
+
46
+ class WebManifest(BaseModel):
47
+ """Information about a dataset."""
48
+ dataset_manifest: DatasetManifest
49
+
50
+
51
+ @router.get('/{namespace}/{dataset_name}')
52
+ def get_manifest(namespace: str, dataset_name: str) -> WebManifest:
53
+ """Get the web manifest for the dataset."""
54
+ dataset = get_dataset(namespace, dataset_name)
55
+ res = WebManifest(dataset_manifest=dataset.manifest())
56
+ # Avoids the error that Signal abstract class is not serializable.
57
+ return cast(WebManifest, ORJSONResponse(res.dict(exclude_none=True)))
58
+
59
+
60
+ class ComputeSignalOptions(BaseModel):
61
+ """The request for the compute signal endpoint."""
62
+ signal: Signal
63
+
64
+ # The leaf path to compute the signal on.
65
+ leaf_path: Path
66
+
67
+ @validator('signal', pre=True)
68
+ def parse_signal(cls, signal: dict) -> Signal:
69
+ """Parse a signal to its specific subclass instance."""
70
+ return resolve_signal(signal)
71
+
72
+
73
+ @router.delete('/{namespace}/{dataset_name}')
74
+ def delete_dataset(namespace: str, dataset_name: str) -> None:
75
+ """Delete the dataset."""
76
+ if not get_user_access().dataset.delete_dataset:
77
+ raise HTTPException(401, 'User does not have access to delete this dataset.')
78
+
79
+ dataset = get_dataset(namespace, dataset_name)
80
+ dataset.delete()
81
+ remove_dataset_from_cache(namespace, dataset_name)
82
+
83
+
84
+ class ComputeSignalResponse(BaseModel):
85
+ """Response of the compute signal column endpoint."""
86
+ task_id: TaskId
87
+
88
+
89
+ @router.post('/{namespace}/{dataset_name}/compute_signal')
90
+ def compute_signal(namespace: str, dataset_name: str,
91
+ options: ComputeSignalOptions) -> ComputeSignalResponse:
92
+ """Compute a signal for a dataset."""
93
+ if not get_user_access().dataset.compute_signals:
94
+ raise HTTPException(401, 'User does not have access to compute signals over this dataset.')
95
+
96
+ def _task_compute_signal(namespace: str, dataset_name: str, options_dict: dict,
97
+ task_id: TaskId) -> None:
98
+ # NOTE: We manually call .dict() to avoid the dask serializer, which doesn't call the underlying
99
+ # pydantic serializer.
100
+ options = ComputeSignalOptions(**options_dict)
101
+ dataset = get_dataset(namespace, dataset_name)
102
+ dataset.compute_signal(options.signal, options.leaf_path, task_step_id=(task_id, 0))
103
+
104
+ path_str = '.'.join(map(str, options.leaf_path))
105
+ task_id = task_manager().task_id(
106
+ name=f'[{namespace}/{dataset_name}] Compute signal "{options.signal.name}" on "{path_str}"',
107
+ description=f'Config: {options.signal}')
108
+ task_manager().execute(task_id, _task_compute_signal, namespace, dataset_name, options.dict(),
109
+ task_id)
110
+
111
+ return ComputeSignalResponse(task_id=task_id)
112
+
113
+
114
+ class DeleteSignalOptions(BaseModel):
115
+ """The request for the delete signal endpoint."""
116
+ # The signal path holding the data from the signal.
117
+ signal_path: Path
118
+
119
+
120
+ class DeleteSignalResponse(BaseModel):
121
+ """Response of the compute signal column endpoint."""
122
+ completed: bool
123
+
124
+
125
+ @router.delete('/{namespace}/{dataset_name}/delete_signal')
126
+ def delete_signal(namespace: str, dataset_name: str,
127
+ options: DeleteSignalOptions) -> DeleteSignalResponse:
128
+ """Delete a signal from a dataset."""
129
+ if not get_user_access().dataset.delete_signals:
130
+ raise HTTPException(401, 'User does not have access to delete this signal.')
131
+
132
+ dataset = get_dataset(namespace, dataset_name)
133
+ dataset.delete_signal(options.signal_path)
134
+ return DeleteSignalResponse(completed=True)
135
+
136
+
137
+ class GetStatsOptions(BaseModel):
138
+ """The request for the get stats endpoint."""
139
+ leaf_path: Path
140
+
141
+
142
+ @router.post('/{namespace}/{dataset_name}/stats')
143
+ def get_stats(namespace: str, dataset_name: str, options: GetStatsOptions) -> StatsResult:
144
+ """Get the stats for the dataset."""
145
+ dataset = get_dataset(namespace, dataset_name)
146
+ return dataset.stats(options.leaf_path)
147
+
148
+
149
+ class BinaryFilter(BaseModel):
150
+ """A filter on a column."""
151
+ path: Path
152
+ op: BinaryOp
153
+ value: FeatureValue
154
+
155
+
156
+ class UnaryFilter(BaseModel):
157
+ """A filter on a column."""
158
+ path: Path
159
+ op: UnaryOp
160
+ value: None = None
161
+
162
+
163
+ class ListFilter(BaseModel):
164
+ """A filter on a column."""
165
+ path: Path
166
+ op: ListOp
167
+ value: FeatureListValue
168
+
169
+
170
+ Filter = Union[BinaryFilter, UnaryFilter, ListFilter]
171
+
172
+ AllSignalTypes = Union[ConceptScoreSignal, ConceptLabelsSignal, SubstringSignal,
173
+ SemanticSimilaritySignal, TextEmbeddingSignal, TextSignal, Signal]
174
+
175
+
176
+ # We override the `Column` class so we can add explicitly all signal types for better OpenAPI spec.
177
+ class Column(DBColumn):
178
+ """A column in the dataset."""
179
+ signal_udf: Optional[AllSignalTypes] = None
180
+
181
+
182
+ class SelectRowsOptions(BaseModel):
183
+ """The request for the select rows endpoint."""
184
+ columns: Optional[Sequence[Union[Path, Column]]] = None
185
+ searches: Optional[Sequence[Search]] = None
186
+ filters: Optional[Sequence[Filter]] = None
187
+ sort_by: Optional[Sequence[Path]] = None
188
+ sort_order: Optional[SortOrder] = SortOrder.DESC
189
+ limit: Optional[int] = None
190
+ offset: Optional[int] = None
191
+ combine_columns: Optional[bool] = None
192
+
193
+
194
+ class SelectRowsSchemaOptions(BaseModel):
195
+ """The request for the select rows schema endpoint."""
196
+ columns: Optional[Sequence[Union[Path, Column]]] = None
197
+ searches: Optional[Sequence[Search]] = None
198
+ sort_by: Optional[Sequence[Path]] = None
199
+ sort_order: Optional[SortOrder] = SortOrder.DESC
200
+ combine_columns: Optional[bool] = None
201
+
202
+
203
+ class SelectRowsResponse(BaseModel):
204
+ """The response for the select rows endpoint."""
205
+ rows: list[dict]
206
+ total_num_rows: int
207
+
208
+
209
+ @router.get('/{namespace}/{dataset_name}/select_rows_download', response_model=None)
210
+ def select_rows_download(
211
+ namespace: str, dataset_name: str, url_safe_options: str,
212
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> list[dict]:
213
+ """Select rows from the dataset database and downloads them."""
214
+ options = SelectRowsOptions.parse_raw(unquote(url_safe_options))
215
+ return select_rows(namespace, dataset_name, options, user).rows
216
+
217
+
218
+ @router.post('/{namespace}/{dataset_name}/select_rows', response_model_exclude_none=True)
219
+ def select_rows(
220
+ namespace: str, dataset_name: str, options: SelectRowsOptions,
221
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> SelectRowsResponse:
222
+ """Select rows from the dataset database."""
223
+ dataset = get_dataset(namespace, dataset_name)
224
+
225
+ sanitized_filters = [
226
+ PyFilter(path=normalize_path(f.path), op=f.op, value=f.value) for f in (options.filters or [])
227
+ ]
228
+
229
+ res = dataset.select_rows(
230
+ columns=options.columns,
231
+ searches=options.searches or [],
232
+ filters=sanitized_filters,
233
+ sort_by=options.sort_by,
234
+ sort_order=options.sort_order,
235
+ limit=options.limit,
236
+ offset=options.offset,
237
+ combine_columns=options.combine_columns or False,
238
+ user=user)
239
+
240
+ return SelectRowsResponse(rows=list(res), total_num_rows=res.total_num_rows)
241
+
242
+
243
+ @router.post('/{namespace}/{dataset_name}/select_rows_schema', response_model_exclude_none=True)
244
+ def select_rows_schema(namespace: str, dataset_name: str,
245
+ options: SelectRowsSchemaOptions) -> SelectRowsSchemaResult:
246
+ """Get the schema of a select rows query."""
247
+ dataset = get_dataset(namespace, dataset_name)
248
+ return dataset.select_rows_schema(
249
+ columns=options.columns,
250
+ searches=options.searches or [],
251
+ sort_by=options.sort_by,
252
+ sort_order=options.sort_order,
253
+ combine_columns=options.combine_columns or False)
254
+
255
+
256
+ class SelectGroupsOptions(BaseModel):
257
+ """The request for the select groups endpoint."""
258
+ leaf_path: Path
259
+ filters: Optional[Sequence[Filter]] = None
260
+ sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT
261
+ sort_order: Optional[SortOrder] = SortOrder.DESC
262
+ limit: Optional[int] = 100
263
+ bins: Optional[list[Bin]] = None
264
+
265
+
266
+ @router.post('/{namespace}/{dataset_name}/select_groups')
267
+ def select_groups(namespace: str, dataset_name: str,
268
+ options: SelectGroupsOptions) -> SelectGroupsResult:
269
+ """Select groups from the dataset database."""
270
+ dataset = get_dataset(namespace, dataset_name)
271
+ sanitized_filters = [
272
+ PyFilter(path=normalize_path(f.path), op=f.op, value=f.value) for f in (options.filters or [])
273
+ ]
274
+ return dataset.select_groups(options.leaf_path, sanitized_filters, options.sort_by,
275
+ options.sort_order, options.limit, options.bins)
276
+
277
+
278
+ @router.get('/{namespace}/{dataset_name}/media')
279
+ def get_media(namespace: str, dataset_name: str, item_id: str, leaf_path: str) -> Response:
280
+ """Get the media for the dataset."""
281
+ dataset = get_dataset(namespace, dataset_name)
282
+ path = tuple(leaf_path.split('.'))
283
+ result = dataset.media(item_id, path)
284
+ # Return the response via HTTP.
285
+ return Response(content=result.data)
286
+
287
+
288
+ @router.get('/{namespace}/{dataset_name}/settings')
289
+ def get_settings(namespace: str, dataset_name: str) -> DatasetSettings:
290
+ """Get the settings for the dataset."""
291
+ dataset = get_dataset(namespace, dataset_name)
292
+ return dataset.settings()
293
+
294
+
295
+ @router.post('/{namespace}/{dataset_name}/settings', response_model_exclude_none=True)
296
+ def update_settings(namespace: str, dataset_name: str, settings: DatasetSettings) -> None:
297
+ """Update the settings for the dataset."""
298
+ if not get_user_access().dataset.compute_signals:
299
+ raise HTTPException(401, 'User does not have access to update the settings of this dataset.')
300
+
301
+ dataset = get_dataset(namespace, dataset_name)
302
+ dataset.update_settings(settings)
303
+ return None
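As a sketch of how a client builds a request body for the select_rows endpoint above (the column name and limit are hypothetical examples, and the router's mount prefix is defined elsewhere in the server), the Pydantic model can be serialized directly:

from lilac.router_dataset import SelectRowsOptions

options = SelectRowsOptions(columns=['text'], limit=10)
# POST this JSON to /{namespace}/{dataset_name}/select_rows.
payload = options.json(exclude_none=True)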
lilac/router_google_login.py ADDED
@@ -0,0 +1,60 @@
1
+ """Router for Google OAuth2 login."""
2
+
3
+ from urllib.parse import urlparse, urlunparse
4
+
5
+ from authlib.integrations.starlette_client import OAuth, OAuthError
6
+ from fastapi import APIRouter, Request, Response
7
+ from fastapi.responses import HTMLResponse
8
+ from starlette.config import Config
9
+ from starlette.responses import RedirectResponse
10
+
11
+ from .auth import UserInfo
12
+ from .env import env
13
+ from .router_utils import RouteErrorHandler
14
+
15
+ router = APIRouter(route_class=RouteErrorHandler)
16
+
17
+ if env('LILAC_AUTH_ENABLED'):
18
+ oauth = OAuth(
19
+ Config(
20
+ environ={
21
+ 'GOOGLE_CLIENT_ID': env('GOOGLE_CLIENT_ID'),
22
+ 'GOOGLE_CLIENT_SECRET': env('GOOGLE_CLIENT_SECRET')
23
+ }))
24
+ oauth.register(
25
+ name='google',
26
+ server_metadata_url='https://accounts.google.com/.well-known/openid-configuration',
27
+ client_kwargs={'scope': 'openid email profile'},
28
+ )
29
+
30
+
31
+ @router.get('/login')
32
+ async def login(request: Request, origin_url: str) -> RedirectResponse:
33
+ """Redirects to Google OAuth login page."""
34
+ auth_path = urlunparse(urlparse(origin_url)._replace(path='/google/auth'))
35
+ return await oauth.google.authorize_redirect(request, auth_path)
36
+
37
+
38
+ @router.get('/auth')
39
+ async def auth(request: Request) -> Response:
40
+ """Handles the Google OAuth callback."""
41
+ try:
42
+ token = await oauth.google.authorize_access_token(request)
43
+ except OAuthError as error:
44
+ return HTMLResponse(f'<h1>{error}</h1>')
45
+ userinfo = token['userinfo']
46
+ request.session['user'] = UserInfo(
47
+ id=str(userinfo['sub']),
48
+ email=userinfo['email'],
49
+ name=userinfo['name'],
50
+ given_name=userinfo['given_name'],
51
+ family_name=userinfo['family_name']).dict()
52
+
53
+ return RedirectResponse(url='/')
54
+
55
+
56
+ @router.get('/logout')
57
+ def logout(request: Request) -> RedirectResponse:
58
+ """Logs the user out."""
59
+ request.session.pop('user', None)
60
+ return RedirectResponse(url='/')
lilac/router_signal.py ADDED
@@ -0,0 +1,105 @@
1
+ """Router for the signal registry."""
2
+
3
+ import math
4
+ from typing import Annotated, Any, Optional
5
+
6
+ from fastapi import APIRouter, Depends
7
+ from pydantic import BaseModel, validator
8
+
9
+ from .auth import UserInfo, get_session_user
10
+ from .router_utils import RouteErrorHandler, server_compute_concept
11
+ from .schema import Field, SignalInputType
12
+ from .signals.concept_scorer import ConceptScoreSignal
13
+ from .signals.signal import SIGNAL_REGISTRY, Signal, TextEmbeddingSignal, resolve_signal
14
+
15
+ router = APIRouter(route_class=RouteErrorHandler)
16
+
17
+ EMBEDDING_SORT_PRIORITIES = ['gte-small', 'gte-base', 'openai', 'sbert']
18
+
19
+
20
+ class SignalInfo(BaseModel):
21
+ """Information about a signal."""
22
+ name: str
23
+ input_type: SignalInputType
24
+ json_schema: dict[str, Any]
25
+
26
+
27
+ @router.get('/', response_model_exclude_none=True)
28
+ def get_signals() -> list[SignalInfo]:
29
+ """List the signals."""
30
+ return [
31
+ SignalInfo(name=s.name, input_type=s.input_type, json_schema=s.schema())
32
+ for s in SIGNAL_REGISTRY.values()
33
+ if not issubclass(s, TextEmbeddingSignal)
34
+ ]
35
+
36
+
37
+ @router.get('/embeddings', response_model_exclude_none=True)
38
+ def get_embeddings() -> list[SignalInfo]:
39
+ """List the embeddings."""
40
+ embedding_infos = [
41
+ SignalInfo(name=s.name, input_type=s.input_type, json_schema=s.schema())
42
+ for s in SIGNAL_REGISTRY.values()
43
+ if issubclass(s, TextEmbeddingSignal)
44
+ ]
45
+
46
+ # Sort the embedding infos by priority.
47
+ embedding_infos = sorted(
48
+ embedding_infos,
49
+ key=lambda s: EMBEDDING_SORT_PRIORITIES.index(s.name)
50
+ if s.name in EMBEDDING_SORT_PRIORITIES else math.inf)
51
+
52
+ return embedding_infos
53
+
54
+
55
+ class SignalComputeOptions(BaseModel):
56
+ """The request for the standalone compute signal endpoint."""
57
+ signal: Signal
58
+ # The inputs to compute.
59
+ inputs: list[str]
60
+
61
+ @validator('signal', pre=True)
62
+ def parse_signal(cls, signal: dict) -> Signal:
63
+ """Parse a signal to its specific subclass instance."""
64
+ return resolve_signal(signal)
65
+
66
+
67
+ class SignalComputeResponse(BaseModel):
68
+ """The response for the standalone compute signal endpoint."""
69
+ items: list[Optional[Any]]
70
+
71
+
72
+ @router.post('/compute', response_model_exclude_none=True)
73
+ def compute(
74
+ options: SignalComputeOptions,
75
+ user: Annotated[Optional[UserInfo], Depends(get_session_user)]) -> SignalComputeResponse:
76
+ """Compute a signal over a set of inputs."""
77
+ signal = options.signal
78
+ if isinstance(signal, ConceptScoreSignal):
79
+ result = server_compute_concept(signal, options.inputs, user)
80
+ else:
81
+ signal.setup()
82
+ result = list(signal.compute(options.inputs))
83
+ return SignalComputeResponse(items=result)
84
+
85
+
86
+ class SignalSchemaOptions(BaseModel):
87
+ """The request for the signal schema endpoint."""
88
+ signal: Signal
89
+
90
+ @validator('signal', pre=True)
91
+ def parse_signal(cls, signal: dict) -> Signal:
92
+ """Parse a signal to its specific subclass instance."""
93
+ return resolve_signal(signal)
94
+
95
+
96
+ class SignalSchemaResponse(BaseModel):
97
+ """The response for the signal schema endpoint."""
98
+ fields: Field
99
+
100
+
101
+ @router.post('/schema', response_model_exclude_none=True)
102
+ def schema(options: SignalSchemaOptions) -> SignalSchemaResponse:
103
+ """Get the schema for a signal."""
104
+ signal = options.signal
105
+ return SignalSchemaResponse(fields=signal.fields())
lilac/router_tasks.py ADDED
@@ -0,0 +1,14 @@
1
+ """Router for tasks."""
2
+
3
+ from fastapi import APIRouter
4
+
5
+ from .router_utils import RouteErrorHandler
6
+ from .tasks import TaskManifest, task_manager
7
+
8
+ router = APIRouter(route_class=RouteErrorHandler)
9
+
10
+
11
+ @router.get('/')
12
+ async def get_task_manifest() -> TaskManifest:
13
+ """Get the tasks, both completed and pending."""
14
+ return await task_manager().manifest()
lilac/router_utils.py ADDED
@@ -0,0 +1,54 @@
1
+ """Utils for routers."""
2
+
3
+ import traceback
4
+ from typing import Callable, Iterable, Optional
5
+
6
+ from fastapi import HTTPException, Request, Response
7
+ from fastapi.routing import APIRoute
8
+
9
+ from .auth import UserInfo
10
+ from .concepts.db_concept import DISK_CONCEPT_DB, DISK_CONCEPT_MODEL_DB
11
+ from .schema import Item, RichData
12
+ from .signals.concept_scorer import ConceptScoreSignal
13
+
14
+
15
+ class RouteErrorHandler(APIRoute):
16
+ """Custom APIRoute that handles application errors and exceptions."""
17
+
18
+ def get_route_handler(self) -> Callable:
19
+ """Get the route handler."""
20
+ original_route_handler = super().get_route_handler()
21
+
22
+ async def custom_route_handler(request: Request) -> Response:
23
+ try:
24
+ return await original_route_handler(request)
25
+ except Exception as ex:
26
+ if isinstance(ex, HTTPException):
27
+ raise ex
28
+
29
+ print('Route error:', request.url)
30
+ print(ex)
31
+ print(traceback.format_exc())
32
+
33
+ # wrap error into pretty 500 exception
34
+ raise HTTPException(status_code=500, detail=traceback.format_exc()) from ex
35
+
36
+ return custom_route_handler
37
+
38
+
39
+ def server_compute_concept(signal: ConceptScoreSignal, examples: Iterable[RichData],
40
+ user: Optional[UserInfo]) -> list[Optional[Item]]:
41
+ """Compute a concept from the REST endpoints."""
42
+ # TODO(nsthorat): Move this to the setup() method in the concept_scorer.
43
+ concept = DISK_CONCEPT_DB.get(signal.namespace, signal.concept_name, user)
44
+ if not concept:
45
+ raise HTTPException(
46
+ status_code=404, detail=f'Concept "{signal.namespace}/{signal.concept_name}" was not found')
47
+ model = DISK_CONCEPT_MODEL_DB.get(
48
+ signal.namespace, signal.concept_name, signal.embedding, user=user)
49
+ if model is None:
50
+ model = DISK_CONCEPT_MODEL_DB.create(
51
+ signal.namespace, signal.concept_name, signal.embedding, user=user)
52
+ DISK_CONCEPT_MODEL_DB.sync(model, user)
53
+ texts = [example or '' for example in examples]
54
+ return list(signal.compute(texts))
lilac/schema.py ADDED
@@ -0,0 +1,600 @@
1
+ """Item: an individual entry in the dataset."""
2
+
3
+ import csv
4
+ import io
5
+ from collections import deque
6
+ from datetime import datetime
7
+ from enum import Enum
8
+ from typing import Any, Optional, Union, cast
9
+
10
+ import numpy as np
11
+ import pyarrow as pa
12
+ from pydantic import BaseModel, StrictInt, StrictStr, validator
13
+ from typing_extensions import TypedDict
14
+
15
+ MANIFEST_FILENAME = 'manifest.json'
16
+ PARQUET_FILENAME_PREFIX = 'data'
17
+
18
+ # We choose `__rowid__` inspired by the standard `rowid` pseudocolumn in DBs:
19
+ # https://docs.oracle.com/cd/B19306_01/server.102/b14200/pseudocolumns008.htm
20
+ UUID_COLUMN = '__rowid__'
21
+ PATH_WILDCARD = '*'
22
+ VALUE_KEY = '__value__'
23
+ SIGNAL_METADATA_KEY = '__metadata__'
24
+ TEXT_SPAN_START_FEATURE = 'start'
25
+ TEXT_SPAN_END_FEATURE = 'end'
26
+
27
+ EMBEDDING_KEY = 'embedding'
28
+
29
+ # Python doesn't work with recursive types. These types provide some notion of type-safety.
30
+ Scalar = Union[bool, datetime, int, float, str, bytes]
31
+ Item = Any
32
+
33
+ # Contains a string field name, a wildcard for repeated fields, or a specific integer index into a repeated field.
34
+ # This path represents a path to a particular column.
35
+ # Examples:
36
+ # ['article', 'field'] represents {'article': {'field': VALUES}}
37
+ # ['article', '*', 'field'] represents {'article': [{'field': VALUES}, {'field': VALUES}]}
38
+ # ['article', '0', 'field'] represents {'article': {'field': VALUES}}
39
+ PathTuple = tuple[StrictStr, ...]
40
+ Path = Union[PathTuple, StrictStr]
41
+
42
+ PathKeyedItem = tuple[Path, Item]
43
+
44
+ # These fields are for Python only and are not written to a schema.
45
+ RichData = Union[str, bytes]
46
+ VectorKey = tuple[Union[StrictStr, StrictInt], ...]
47
+ PathKey = VectorKey
48
+
49
+
50
+ class DataType(str, Enum):
51
+ """Enum holding the dtype for a field."""
52
+ STRING = 'string'
53
+ # Contains {start, end} offset integers with a reference_column.
54
+ STRING_SPAN = 'string_span'
55
+ BOOLEAN = 'boolean'
56
+
57
+ # Ints.
58
+ INT8 = 'int8'
59
+ INT16 = 'int16'
60
+ INT32 = 'int32'
61
+ INT64 = 'int64'
62
+ UINT8 = 'uint8'
63
+ UINT16 = 'uint16'
64
+ UINT32 = 'uint32'
65
+ UINT64 = 'uint64'
66
+
67
+ # Floats.
68
+ FLOAT16 = 'float16'
69
+ FLOAT32 = 'float32'
70
+ FLOAT64 = 'float64'
71
+
72
+ ### Time ###
73
+ # Time of day (no time zone).
74
+ TIME = 'time'
75
+ # Calendar date (year, month, day), no time zone.
76
+ DATE = 'date'
77
+ # An "Instant" stored as number of microseconds (µs) since 1970-01-01 00:00:00+00 (UTC time zone).
78
+ TIMESTAMP = 'timestamp'
79
+ # Time span, stored as microseconds.
80
+ INTERVAL = 'interval'
81
+
82
+ BINARY = 'binary'
83
+
84
+ EMBEDDING = 'embedding'
85
+
86
+ NULL = 'null'
87
+
88
+ def __repr__(self) -> str:
89
+ return self.value
90
+
91
+
92
+ class SignalInputType(str, Enum):
93
+ """Enum holding the signal input type."""
94
+ TEXT = 'text'
95
+ TEXT_EMBEDDING = 'text_embedding'
96
+ IMAGE = 'image'
97
+
98
+ def __repr__(self) -> str:
99
+ return self.value
100
+
101
+
102
+ SIGNAL_TYPE_TO_VALID_DTYPES: dict[SignalInputType, list[DataType]] = {
103
+ SignalInputType.TEXT: [DataType.STRING, DataType.STRING_SPAN],
104
+ SignalInputType.IMAGE: [DataType.BINARY],
105
+ }
106
+
107
+
108
+ def signal_type_supports_dtype(input_type: SignalInputType, dtype: DataType) -> bool:
109
+ """Returns True if the signal compute type supports the dtype."""
110
+ return dtype in SIGNAL_TYPE_TO_VALID_DTYPES[input_type]
111
+
112
+
113
+ Bin = tuple[str, Optional[Union[float, int]], Optional[Union[float, int]]]
114
+
115
+
116
+ class Field(BaseModel):
117
+ """Holds information for a field in the schema."""
118
+ repeated_field: Optional['Field'] = None
119
+ fields: Optional[dict[str, 'Field']] = None
120
+ dtype: Optional[DataType] = None
121
+ # Defined as the serialized signal when this field is the root result of a signal.
122
+ signal: Optional[dict[str, Any]] = None
123
+ # Maps a named bin to a tuple of (start, end) values.
124
+ bins: Optional[list[Bin]] = None
125
+ categorical: Optional[bool] = None
126
+
127
+ @validator('fields')
128
+ def either_fields_or_repeated_field_is_defined(
129
+ cls, fields: Optional[dict[str, 'Field']], values: dict[str,
130
+ Any]) -> Optional[dict[str, 'Field']]:
131
+ """Error if both `fields` and `repeated_fields` are defined."""
132
+ if not fields:
133
+ return fields
134
+ if values.get('repeated_field'):
135
+ raise ValueError('Both "fields" and "repeated_field" should not be defined')
136
+ if VALUE_KEY in fields:
137
+ raise ValueError(f'{VALUE_KEY} is a reserved field name.')
138
+ return fields
139
+
140
+ @validator('dtype', always=True)
141
+ def infer_default_dtype(cls, dtype: Optional[DataType], values: dict[str,
142
+ Any]) -> Optional[DataType]:
143
+ """Infers the default value for dtype if not explicitly provided."""
144
+ if dtype and values.get('repeated_field'):
145
+ raise ValueError('dtype and repeated_field cannot both be defined.')
146
+ if not values.get('repeated_field') and not values.get('fields') and not dtype:
147
+ raise ValueError('One of "fields", "repeated_field", or "dtype" should be defined')
148
+ return dtype
149
+
150
+ @validator('bins')
151
+ def validate_bins(cls, bins: list[Bin]) -> list[Bin]:
152
+ """Validate the bins."""
153
+ if len(bins) < 2:
154
+ raise ValueError('Please specify at least two bins.')
155
+ _, first_start, _ = bins[0]
156
+ if first_start is not None:
157
+ raise ValueError('The first bin should have a `None` start value.')
158
+ _, _, last_end = bins[-1]
159
+ if last_end is not None:
160
+ raise ValueError('The last bin should have a `None` end value.')
161
+ for i, (_, start, _) in enumerate(bins):
162
+ if i == 0:
163
+ continue
164
+ prev_bin = bins[i - 1]
165
+ _, _, prev_end = prev_bin
166
+ if start != prev_end:
167
+ raise ValueError(
168
+ f'Bin {i} start ({start}) should be equal to the previous bin end {prev_end}.')
169
+ return bins
170
+
171
+ @validator('categorical')
172
+ def validate_categorical(cls, categorical: bool, values: dict[str, Any]) -> bool:
173
+ """Validate the categorical field."""
174
+ if categorical and is_float(values['dtype']):
175
+ raise ValueError('Categorical fields cannot be float dtypes.')
176
+ return categorical
177
+
178
+ def __str__(self) -> str:
179
+ return _str_field(self, indent=0)
180
+
181
+ def __repr__(self) -> str:
182
+ return f' {self.__class__.__name__}::{self.json(exclude_none=True, indent=2)}'
183
+
184
+
185
+ class Schema(BaseModel):
186
+ """Database schema."""
187
+ fields: dict[str, Field]
188
+ # Cached leafs.
189
+ _leafs: Optional[dict[PathTuple, Field]] = None
190
+
191
+ class Config:
192
+ arbitrary_types_allowed = True
193
+ underscore_attrs_are_private = True
194
+
195
+ @property
196
+ def leafs(self) -> dict[PathTuple, Field]:
197
+ """Return all the leaf fields in the schema. A leaf is defined as a node that contains a value.
198
+
199
+ NOTE: Leafs may contain children. Leafs can be found as any node that has a dtype defined.
200
+ """
201
+ if self._leafs:
202
+ return self._leafs
203
+ result: dict[PathTuple, Field] = {}
204
+ q: deque[tuple[PathTuple, Field]] = deque([((), Field(fields=self.fields))])
205
+ while q:
206
+ path, field = q.popleft()
207
+ if field.dtype:
208
+ # Nodes with dtypes act as leafs. They also may have children.
209
+ result[path] = field
210
+ if field.fields:
211
+ for name, child_field in field.fields.items():
212
+ child_path = (*path, name)
213
+ q.append((child_path, child_field))
214
+ elif field.repeated_field:
215
+ child_path = (*path, PATH_WILDCARD)
216
+ q.append((child_path, field.repeated_field))
217
+
218
+ self._leafs = result
219
+ return result
220
+
221
+ def has_field(self, path: PathTuple) -> bool:
222
+ """Returns if the field is found at the given path."""
223
+ field = cast(Field, self)
224
+ for path_part in path:
225
+ if field.fields:
226
+ field = cast(Field, field.fields.get(path_part))
227
+ if not field:
228
+ return False
229
+ elif field.repeated_field:
230
+ if path_part != PATH_WILDCARD:
231
+ return False
232
+ field = field.repeated_field
233
+ else:
234
+ return False
235
+ return True
236
+
237
+ def get_field(self, path: PathTuple) -> Field:
238
+ """Returns the field at the given path."""
239
+ field = cast(Field, self)
240
+ for name in path:
241
+ if field.fields:
242
+ if name not in field.fields:
243
+ raise ValueError(f'Path {path} not found in schema')
244
+ field = field.fields[name]
245
+ elif field.repeated_field:
246
+ if name != PATH_WILDCARD:
247
+ raise ValueError(f'Invalid path {path}')
248
+ field = field.repeated_field
249
+ else:
250
+ raise ValueError(f'Invalid path {path}')
251
+ return field
252
+
253
+ def __str__(self) -> str:
254
+ return _str_fields(self.fields, indent=0)
255
+
256
+ def __repr__(self) -> str:
257
+ return self.json(exclude_none=True, indent=2)
258
+
259
+
260
+ def schema(schema_like: object) -> Schema:
261
+ """Parse a schema-like object to a Schema object."""
262
+ field = _parse_field_like(schema_like)
263
+ if not field.fields:
264
+ raise ValueError('Schema must have fields')
265
+ return Schema(fields=field.fields)
266
+
267
+
268
+ def field(
269
+ dtype: Optional[Union[DataType, str]] = None,
270
+ signal: Optional[dict] = None,
271
+ fields: Optional[object] = None,
272
+ bins: Optional[list[Bin]] = None,
273
+ categorical: Optional[bool] = None,
274
+ ) -> Field:
275
+ """Parse a field-like object to a Field object."""
276
+ field = _parse_field_like(fields or {}, dtype)
277
+ if signal:
278
+ field.signal = signal
279
+ if dtype:
280
+ if isinstance(dtype, str):
281
+ dtype = DataType(dtype)
282
+ field.dtype = dtype
283
+ if bins:
284
+ field.bins = bins
285
+ if categorical is not None:
286
+ field.categorical = categorical
287
+ return field
288
+
289
+
290
+ class SpanVector(TypedDict):
291
+ """A span with a vector."""
292
+ span: tuple[int, int]
293
+ vector: np.ndarray
294
+
295
+
296
+ def lilac_span(start: int, end: int, metadata: dict[str, Any] = {}) -> Item:
297
+ """Creates a lilac span item, representing a pointer to a slice of text."""
298
+ return {VALUE_KEY: {TEXT_SPAN_START_FEATURE: start, TEXT_SPAN_END_FEATURE: end}, **metadata}
299
+
300
+
301
+ def lilac_embedding(start: int, end: int, embedding: Optional[np.ndarray]) -> Item:
302
+ """Creates a lilac embedding item, representing a vector with a pointer to a slice of text."""
303
+ return lilac_span(start, end, {EMBEDDING_KEY: embedding})
304
+
305
+
306
+ def _parse_field_like(field_like: object, dtype: Optional[Union[DataType, str]] = None) -> Field:
307
+ if isinstance(field_like, Field):
308
+ return field_like
309
+ elif isinstance(field_like, dict):
310
+ fields: dict[str, Field] = {}
311
+ for k, v in field_like.items():
312
+ fields[k] = _parse_field_like(v)
313
+ if isinstance(dtype, str):
314
+ dtype = DataType(dtype)
315
+ return Field(fields=fields or None, dtype=dtype)
316
+ elif isinstance(field_like, str):
317
+ return Field(dtype=DataType(field_like))
318
+ elif isinstance(field_like, list):
319
+ return Field(repeated_field=_parse_field_like(field_like[0], dtype=dtype))
320
+ else:
321
+ raise ValueError(f'Cannot parse field like: {field_like}')
322
+
323
+
324
+ def child_item_from_column_path(item: Item, path: Path) -> Item:
325
+ """Return the last (child) item from a column path."""
326
+ child_item_value = item
327
+ for path_part in path:
328
+ if path_part == PATH_WILDCARD:
329
+ raise ValueError(
330
+ 'child_item_from_column_path cannot be called with a path that contains a repeated '
331
+ f'wildcard: "{path}"')
332
+ # path_part can either be an integer or a string for a dictionary, both of which we can
333
+ # directly index with.
334
+ child_path = int(path_part) if path_part.isdigit() else path_part
335
+ child_item_value = child_item_value[child_path]
336
+ return child_item_value
337
+
338
+
339
+ def column_paths_match(path_match: Path, specific_path: Path) -> bool:
340
+ """Test whether two column paths match.
341
+
342
+ Args:
343
+ path_match: A column path that contains wildcards, and sub-paths. This path will be used for
344
+ testing the second specific path.
345
+ specific_path: A column path that specifically identifies an field.
346
+
347
+ Returns
348
+ Whether specific_path matches the path_match. This will only match when the
349
+ paths are equal length. If a user wants to enrich everything with an array, they must use the
350
+ path wildcard '*' in their path_match.
351
+ """
352
+ if isinstance(path_match, str):
353
+ path_match = (path_match,)
354
+ if isinstance(specific_path, str):
355
+ specific_path = (specific_path,)
356
+
357
+ if len(path_match) != len(specific_path):
358
+ return False
359
+
360
+ for path_match_p, specific_path_p in zip(path_match, specific_path):
361
+ if path_match_p == PATH_WILDCARD:
362
+ continue
363
+
364
+ if path_match_p != specific_path_p:
365
+ return False
366
+
367
+ return True
368
+
369
+
370
+ def normalize_path(path: Path) -> PathTuple:
371
+ """Normalizes a dot-separated path, but ignores dots inside quotes, like regular SQL.
372
+
373
+ Examples
374
+ - 'a.b.c' will be parsed as ('a', 'b', 'c').
375
+ - '"a.b".c' will be parsed as ('a.b', 'c').
376
+ - '"a".b.c' will be parsed as ('a', 'b', 'c').
377
+ """
378
+ if isinstance(path, str):
379
+ return tuple(next(csv.reader(io.StringIO(path), delimiter='.')))
380
+ return path
381
+
382
+
383
+ class ImageInfo(BaseModel):
384
+ """Info about an individual image."""
385
+ path: Path
386
+
387
+
388
+ class SourceManifest(BaseModel):
389
+ """The manifest that describes the dataset run, including schema and parquet files."""
390
+ # List of a parquet filepaths storing the data. The paths can be relative to `manifest.json`.
391
+ files: list[str]
392
+ # The data schema.
393
+ data_schema: Schema
394
+
395
+ # Image information for the dataset.
396
+ images: Optional[list[ImageInfo]] = None
397
+
398
+
399
+ def _str_fields(fields: dict[str, Field], indent: int) -> str:
400
+ prefix = ' ' * indent
401
+ out: list[str] = []
402
+ for name, field in fields.items():
403
+ out.append(f'{prefix}{name}:{_str_field(field, indent=indent + 2)}')
404
+ return '\n'.join(out)
405
+
406
+
407
+ def _str_field(field: Field, indent: int) -> str:
408
+ if field.fields:
409
+ prefix = '\n' if indent > 0 else ''
410
+ return f'{prefix}{_str_fields(field.fields, indent)}'
411
+ if field.repeated_field:
412
+ return f' list({_str_field(field.repeated_field, indent)})'
413
+ return f' {cast(DataType, field.dtype)}'
414
+
415
+
416
+ def dtype_to_arrow_schema(dtype: DataType) -> Union[pa.Schema, pa.DataType]:
417
+ """Convert the dtype to an arrow dtype."""
418
+ if dtype == DataType.STRING:
419
+ return pa.string()
420
+ elif dtype == DataType.BOOLEAN:
421
+ return pa.bool_()
422
+ elif dtype == DataType.FLOAT16:
423
+ return pa.float16()
424
+ elif dtype == DataType.FLOAT32:
425
+ return pa.float32()
426
+ elif dtype == DataType.FLOAT64:
427
+ return pa.float64()
428
+ elif dtype == DataType.INT8:
429
+ return pa.int8()
430
+ elif dtype == DataType.INT16:
431
+ return pa.int16()
432
+ elif dtype == DataType.INT32:
433
+ return pa.int32()
434
+ elif dtype == DataType.INT64:
435
+ return pa.int64()
436
+ elif dtype == DataType.UINT8:
437
+ return pa.uint8()
438
+ elif dtype == DataType.UINT16:
439
+ return pa.uint16()
440
+ elif dtype == DataType.UINT32:
441
+ return pa.uint32()
442
+ elif dtype == DataType.UINT64:
443
+ return pa.uint64()
444
+ elif dtype == DataType.BINARY:
445
+ return pa.binary()
446
+ elif dtype == DataType.TIME:
447
+ return pa.time64('us')
448
+ elif dtype == DataType.DATE:
449
+ return pa.date64()
450
+ elif dtype == DataType.TIMESTAMP:
451
+ return pa.timestamp('us')
452
+ elif dtype == DataType.INTERVAL:
453
+ return pa.duration('us')
454
+ elif dtype == DataType.EMBEDDING:
455
+ # We reserve an empty column for embeddings in parquet files so they can be queried.
456
+ # The values are *not* filled out. If parquet and duckdb support embeddings in the future, we
457
+ # can set this dtype to the relevant pyarrow type.
458
+ return pa.null()
459
+ elif dtype == DataType.STRING_SPAN:
460
+ return pa.struct({
461
+ VALUE_KEY: pa.struct({
462
+ TEXT_SPAN_START_FEATURE: pa.int32(),
463
+ TEXT_SPAN_END_FEATURE: pa.int32()
464
+ })
465
+ })
466
+ elif dtype == DataType.NULL:
467
+ return pa.null()
468
+ else:
469
+ raise ValueError(f'Can not convert dtype "{dtype}" to arrow dtype')
470
+
471
+
472
+ def schema_to_arrow_schema(schema: Union[Schema, Field]) -> pa.Schema:
473
+ """Convert our schema to arrow schema."""
474
+ arrow_schema = cast(pa.Schema, _schema_to_arrow_schema_impl(schema))
475
+ arrow_fields = {field.name: field.type for field in arrow_schema}
476
+ return pa.schema(arrow_fields)
477
+
478
+
479
+ def _schema_to_arrow_schema_impl(schema: Union[Schema, Field]) -> Union[pa.Schema, pa.DataType]:
480
+ """Convert a schema to an apache arrow schema."""
481
+ if schema.fields:
482
+ arrow_fields: dict[str, Union[pa.Schema, pa.DataType]] = {}
483
+ for name, field in schema.fields.items():
484
+ if name == UUID_COLUMN:
485
+ arrow_schema = dtype_to_arrow_schema(cast(DataType, field.dtype))
486
+ else:
487
+ arrow_schema = _schema_to_arrow_schema_impl(field)
488
+ arrow_fields[name] = arrow_schema
489
+
490
+ if isinstance(schema, Schema):
491
+ # Top-level schemas do not have __value__ fields.
492
+ return pa.schema(arrow_fields)
493
+ else:
494
+ # When nodes have both dtype and children, we add __value__ alongside the fields.
495
+ if schema.dtype:
496
+ value_schema = dtype_to_arrow_schema(schema.dtype)
497
+ if schema.dtype == DataType.STRING_SPAN:
498
+ value_schema = value_schema[VALUE_KEY].type
499
+ arrow_fields[VALUE_KEY] = value_schema
500
+
501
+ return pa.struct(arrow_fields)
502
+
503
+ field = cast(Field, schema)
504
+ if field.repeated_field:
505
+ return pa.list_(_schema_to_arrow_schema_impl(field.repeated_field))
506
+
507
+ return dtype_to_arrow_schema(cast(DataType, field.dtype))
508
+
509
+
510
+ def arrow_dtype_to_dtype(arrow_dtype: pa.DataType) -> DataType:
511
+ """Convert arrow dtype to our dtype."""
512
+ # Ints.
513
+ if arrow_dtype == pa.int8():
514
+ return DataType.INT8
515
+ elif arrow_dtype == pa.int16():
516
+ return DataType.INT16
517
+ elif arrow_dtype == pa.int32():
518
+ return DataType.INT32
519
+ elif arrow_dtype == pa.int64():
520
+ return DataType.INT64
521
+ elif arrow_dtype == pa.uint8():
522
+ return DataType.UINT8
523
+ elif arrow_dtype == pa.uint16():
524
+ return DataType.UINT16
525
+ elif arrow_dtype == pa.uint32():
526
+ return DataType.UINT32
527
+ elif arrow_dtype == pa.uint64():
528
+ return DataType.UINT64
529
+ # Floats.
530
+ elif arrow_dtype == pa.float16():
531
+ return DataType.FLOAT16
532
+ elif arrow_dtype == pa.float32():
533
+ return DataType.FLOAT32
534
+ elif arrow_dtype == pa.float64():
535
+ return DataType.FLOAT64
536
+ # Time.
537
+ elif pa.types.is_time(arrow_dtype):
538
+ return DataType.TIME
539
+ elif pa.types.is_date(arrow_dtype):
540
+ return DataType.DATE
541
+ elif pa.types.is_timestamp(arrow_dtype):
542
+ return DataType.TIMESTAMP
543
+ elif pa.types.is_duration(arrow_dtype):
544
+ return DataType.INTERVAL
545
+ # Others.
546
+ elif arrow_dtype == pa.string():
547
+ return DataType.STRING
548
+ elif pa.types.is_binary(arrow_dtype) or pa.types.is_fixed_size_binary(arrow_dtype):
549
+ return DataType.BINARY
550
+ elif pa.types.is_boolean(arrow_dtype):
551
+ return DataType.BOOLEAN
552
+ elif arrow_dtype == pa.null():
553
+ return DataType.NULL
554
+ else:
555
+ raise ValueError(f'Can not convert arrow dtype "{arrow_dtype}" to our dtype')
556
+
557
+
558
+ def arrow_schema_to_schema(schema: pa.Schema) -> Schema:
559
+ """Convert arrow schema to our schema."""
560
+ # TODO(nsthorat): Change this implementation to allow more complicated reading of arrow schemas
561
+ # into our schema by inferring values when {__value__: value} is present in the pyarrow schema.
562
+ # This isn't necessary today as this util is only needed by sources which do not have data in the
563
+ # lilac format.
564
+ return cast(Schema, _arrow_schema_to_schema_impl(schema))
565
+
566
+
567
+ def _arrow_schema_to_schema_impl(schema: Union[pa.Schema, pa.DataType]) -> Union[Schema, Field]:
568
+ """Convert an apache arrow schema to our schema."""
569
+ if isinstance(schema, (pa.Schema, pa.StructType)):
570
+ fields: dict[str, Field] = {
571
+ field.name: cast(Field, _arrow_schema_to_schema_impl(field.type)) for field in schema
572
+ }
573
+ return Schema(fields=fields) if isinstance(schema, pa.Schema) else Field(fields=fields)
574
+ elif isinstance(schema, pa.ListType):
575
+ return Field(repeated_field=cast(Field, _arrow_schema_to_schema_impl(schema.value_field.type)))
576
+ else:
577
+ return Field(dtype=arrow_dtype_to_dtype(schema))
578
+
579
+
580
+ def is_float(dtype: DataType) -> bool:
581
+ """Check if a dtype is a float dtype."""
582
+ return dtype in [DataType.FLOAT16, DataType.FLOAT32, DataType.FLOAT64]
583
+
584
+
585
+ def is_integer(dtype: DataType) -> bool:
586
+ """Check if a dtype is an integer dtype."""
587
+ return dtype in [
588
+ DataType.INT8, DataType.INT16, DataType.INT32, DataType.INT64, DataType.UINT8, DataType.UINT16,
589
+ DataType.UINT32, DataType.UINT64
590
+ ]
591
+
592
+
593
+ def is_temporal(dtype: DataType) -> bool:
594
+ """Check if a dtype is a temporal dtype."""
595
+ return dtype in [DataType.TIME, DataType.DATE, DataType.TIMESTAMP, DataType.INTERVAL]
596
+
597
+
598
+ def is_ordinal(dtype: DataType) -> bool:
599
+ """Check if a dtype is an ordinal dtype."""
600
+ return is_float(dtype) or is_integer(dtype) or is_temporal(dtype)
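To tie the helpers in this module together, here is a minimal sketch (the field names are hypothetical, not taken from this diff) of building a nested schema, checking a path, and converting it to a pyarrow schema:

from lilac.schema import DataType, field, normalize_path, schema, schema_to_arrow_schema

s = schema({
  'doc': {
    'title': 'string',
    'paragraphs': [{
      'text': field('string'),
      'num_tokens': field(DataType.INT32),
    }],
  },
})
# Repeated fields are addressed with the '*' wildcard.
assert s.has_field(('doc', 'paragraphs', '*', 'text'))
assert normalize_path('doc.paragraphs.*.text') == ('doc', 'paragraphs', '*', 'text')
arrow_schema = schema_to_arrow_schema(s)  # A pyarrow schema mirroring the nested structure.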