EL GHAFRAOUI AYOUB committed on
Commit 54f5afe · 1 Parent(s): 70ba739
.cruft.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "template": "https://github.com/superlinear-ai/poetry-cookiecutter",
+   "commit": "b7f2fb0f123aae0a01d2ab015db31f52d2d8cc21",
+   "checkout": null,
+   "context": {
+     "cookiecutter": {
+       "project_type": "package",
+       "project_name": "RAGLite",
+       "project_description": "A Python toolkit for Retrieval-Augmented Generation (RAG) with SQLite or PostgreSQL.",
+       "project_url": "https://github.com/superlinear-ai/raglite",
+       "author_name": "Laurent Sorber",
+       "author_email": "[email protected]",
+       "python_version": "3.10",
+       "development_environment": "strict",
+       "with_conventional_commits": "1",
+       "with_fastapi_api": "0",
+       "with_typer_cli": "0",
+       "continuous_integration": "GitHub",
+       "private_package_repository_name": "",
+       "private_package_repository_url": "",
+       "__docker_image": "python:$PYTHON_VERSION-slim",
+       "__docstring_style": "NumPy",
+       "__project_name_kebab_case": "raglite",
+       "__project_name_snake_case": "raglite",
+       "_template": "https://github.com/superlinear-ai/poetry-cookiecutter"
+     }
+   },
+   "directory": null
+ }
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "name": "raglite",
+   "dockerComposeFile": "../docker-compose.yml",
+   "service": "devcontainer",
+   "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}/",
+   "remoteUser": "user",
+   "overrideCommand": true,
+   "postStartCommand": "cp --update /opt/build/poetry/poetry.lock /workspaces/${localWorkspaceFolderBasename}/ && mkdir -p /workspaces/${localWorkspaceFolderBasename}/.git/hooks/ && cp --update /opt/build/git/* /workspaces/${localWorkspaceFolderBasename}/.git/hooks/",
+   "customizations": {
+     "vscode": {
+       "extensions": [
+         "charliermarsh.ruff",
+         "GitHub.vscode-github-actions",
+         "GitHub.vscode-pull-request-github",
+         "ms-python.mypy-type-checker",
+         "ms-python.python",
+         "ms-toolsai.jupyter",
+         "ryanluker.vscode-coverage-gutters",
+         "tamasfe.even-better-toml",
+         "visualstudioexptteam.vscodeintellicode"
+       ],
+       "settings": {
+         "coverage-gutters.coverageFileNames": [
+           "reports/coverage.xml"
+         ],
+         "editor.codeActionsOnSave": {
+           "source.fixAll": "explicit",
+           "source.organizeImports": "explicit"
+         },
+         "editor.formatOnSave": true,
+         "[python]": {
+           "editor.defaultFormatter": "charliermarsh.ruff"
+         },
+         "[toml]": {
+           "editor.formatOnSave": false
+         },
+         "editor.rulers": [
+           100
+         ],
+         "files.autoSave": "onFocusChange",
+         "jupyter.kernels.excludePythonEnvironments": [
+           "/usr/local/bin/python"
+         ],
+         "mypy-type-checker.importStrategy": "fromEnvironment",
+         "mypy-type-checker.preferDaemon": true,
+         "notebook.codeActionsOnSave": {
+           "notebook.source.fixAll": "explicit",
+           "notebook.source.organizeImports": "explicit"
+         },
+         "notebook.formatOnSave.enabled": true,
+         "python.defaultInterpreterPath": "/opt/raglite-env/bin/python",
+         "python.terminal.activateEnvironment": false,
+         "python.testing.pytestEnabled": true,
+         "ruff.importStrategy": "fromEnvironment",
+         "ruff.logLevel": "warning",
+         "terminal.integrated.defaultProfile.linux": "zsh",
+         "terminal.integrated.profiles.linux": {
+           "zsh": {
+             "path": "/usr/bin/zsh"
+           }
+         }
+       }
+     }
+   }
+ }
.dockerignore ADDED
@@ -0,0 +1,5 @@
+ # Caches
+ .*_cache/
+
+ # Git
+ .git/
.github/dependabot.yml ADDED
@@ -0,0 +1,29 @@
+ version: 2
+
+ updates:
+   - package-ecosystem: github-actions
+     directory: /
+     schedule:
+       interval: monthly
+     commit-message:
+       prefix: "ci"
+       prefix-development: "ci"
+       include: scope
+     groups:
+       ci-dependencies:
+         patterns:
+           - "*"
+   - package-ecosystem: pip
+     directory: /
+     schedule:
+       interval: monthly
+     commit-message:
+       prefix: "chore"
+       prefix-development: "build"
+       include: scope
+     allow:
+       - dependency-type: development
+     versioning-strategy: increase
+     groups:
+       development-dependencies:
+         dependency-type: development
.github/workflows/publish.yml ADDED
@@ -0,0 +1,27 @@
+ name: Publish
+
+ on:
+   release:
+     types:
+       - created
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+
+       - name: Install Poetry
+         run: pip install --no-input poetry
+
+       - name: Publish package
+         run: |
+           poetry config pypi-token.pypi "${{ secrets.POETRY_PYPI_TOKEN_PYPI }}"
+           poetry publish --build
.github/workflows/test.yml ADDED
@@ -0,0 +1,47 @@
+ name: Test
+
+ on:
+   push:
+     branches:
+       - main
+       - master
+   pull_request:
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.10", "3.11"]
+
+     name: Python ${{ matrix.python-version }}
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Set up Node.js
+         uses: actions/setup-node@v4
+         with:
+           node-version: 21
+
+       - name: Install @devcontainers/cli
+         run: npm install --location=global @devcontainers/[email protected]
+
+       - name: Start Dev Container
+         run: |
+           git config --global init.defaultBranch main
+           PYTHON_VERSION=${{ matrix.python-version }} OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} devcontainer up --workspace-folder .
+
+       - name: Lint package
+         run: devcontainer exec --workspace-folder . poe lint
+
+       - name: Test package
+         run: devcontainer exec --workspace-folder . poe test
+
+       - name: Upload coverage
+         uses: codecov/codecov-action@v4
+         with:
+           files: reports/coverage.xml
.gitignore ADDED
@@ -0,0 +1,76 @@
+ # Chainlit
+ .chainlit/
+ .files/
+ chainlit.md
+
+ # Coverage.py
+ htmlcov/
+ reports/
+
+ # cruft
+ *.rej
+
+ # Data
+ *.csv*
+ *.dat*
+ *.pickle*
+ *.xls*
+ *.zip*
+ data/
+
+ # direnv
+ .envrc
+
+ # dotenv
+ .env
+
+ # rerankers
+ .*_cache/
+
+ # Hypothesis
+ .hypothesis/
+
+ # Jupyter
+ *.ipynb
+ .ipynb_checkpoints/
+ notebooks/
+
+ # macOS
+ .DS_Store
+
+ # mypy
+ .dmypy.json
+ .mypy_cache/
+
+ # Node.js
+ node_modules/
+
+ # Poetry
+ .venv/
+ dist/
+
+ # PyCharm
+ .idea/
+
+ # pyenv
+ .python-version
+
+ # pytest
+ .pytest_cache/
+
+ # Python
+ __pycache__/
+ *.py[cdo]
+
+ # RAGLite
+ *.db
+ *.sqlite
+
+ # Ruff
+ .ruff_cache/
+
+ # Terraform
+ .terraform/
+
+ # VS Code
+ .vscode/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,83 @@
+ # https://pre-commit.com
+ default_install_hook_types: [commit-msg, pre-commit]
+ default_stages: [commit, manual]
+ fail_fast: true
+ repos:
+   - repo: meta
+     hooks:
+       - id: check-useless-excludes
+   - repo: https://github.com/pre-commit/pygrep-hooks
+     rev: v1.10.0
+     hooks:
+       - id: python-check-mock-methods
+       - id: python-use-type-annotations
+       - id: rst-backticks
+       - id: rst-directive-colons
+       - id: rst-inline-touching-normal
+       - id: text-unicode-replacement-char
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.5.0
+     hooks:
+       - id: check-added-large-files
+       - id: check-ast
+       - id: check-builtin-literals
+       - id: check-case-conflict
+       - id: check-docstring-first
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-symlinks
+       - id: check-toml
+       - id: check-vcs-permalinks
+       - id: check-xml
+       - id: check-yaml
+       - id: debug-statements
+       - id: destroyed-symlinks
+       - id: detect-private-key
+       - id: end-of-file-fixer
+         types: [python]
+       - id: fix-byte-order-marker
+       - id: mixed-line-ending
+       - id: name-tests-test
+         args: [--pytest-test-first]
+       - id: trailing-whitespace
+         types: [python]
+   - repo: local
+     hooks:
+       - id: commitizen
+         name: commitizen
+         entry: cz check
+         args: [--commit-msg-file]
+         require_serial: true
+         language: system
+         stages: [commit-msg]
+       - id: ruff-check
+         name: ruff check
+         entry: ruff check
+         args: ["--force-exclude", "--extend-fixable=ERA001,F401,F841,T201,T203"]
+         require_serial: true
+         language: system
+         types_or: [python, pyi]
+       - id: ruff-format
+         name: ruff format
+         entry: ruff format
+         args: [--force-exclude]
+         require_serial: true
+         language: system
+         types_or: [python, pyi]
+       - id: shellcheck
+         name: shellcheck
+         entry: shellcheck
+         args: [--check-sourced]
+         language: system
+         types: [shell]
+       - id: poetry-check
+         name: poetry check
+         entry: poetry check
+         language: system
+         pass_filenames: false
+       - id: mypy
+         name: mypy
+         entry: mypy
+         language: system
+         types: [python]
CHANGELOG.md ADDED
@@ -0,0 +1,53 @@
+ ## v0.2.0 (2024-10-21)
+
+ ### Feat
+
+ - add Chainlit frontend (#33)
+
+ ## v0.1.4 (2024-10-15)
+
+ ### Fix
+
+ - fix optimal chunking edge cases (#32)
+
+ ## v0.1.3 (2024-10-13)
+
+ ### Fix
+
+ - upgrade pdftext (#30)
+ - improve chunk and segment ordering (#29)
+
+ ## v0.1.2 (2024-10-08)
+
+ ### Fix
+
+ - avoid pdftext v0.3.11 (#27)
+
+ ## v0.1.1 (2024-10-07)
+
+ ### Fix
+
+ - patch rerankers flashrank issue (#22)
+
+ ## v0.1.0 (2024-10-07)
+
+ ### Feat
+
+ - add reranking (#20)
+ - add LiteLLM and late chunking (#19)
+ - add PostgreSQL support (#18)
+ - make query adapter minimally invasive (#16)
+ - upgrade default CPU model to Phi-3.5-mini (#15)
+ - add evaluation (#14)
+ - infer missing font sizes (#12)
+ - automatically adjust number of RAG contexts (#10)
+ - improve exception feedback for extraction (#9)
+ - optimize config for CPU and GPU (#7)
+ - simplify document insertion (#6)
+ - implement basic features (#2)
+ - initial commit
+
+ ### Fix
+
+ - lazily import optional dependencies (#11)
+ - improve indexing of multiple documents (#8)
Dockerfile ADDED
@@ -0,0 +1,101 @@
+ # syntax=docker/dockerfile:1
+ ARG PYTHON_VERSION=3.10
+ FROM python:$PYTHON_VERSION-slim AS base
+
+ # Remove docker-clean so we can keep the apt cache in Docker build cache.
+ RUN rm /etc/apt/apt.conf.d/docker-clean
+
+ # Configure Python to print tracebacks on crash [1], and to not buffer stdout and stderr [2].
+ # [1] https://docs.python.org/3/using/cmdline.html#envvar-PYTHONFAULTHANDLER
+ # [2] https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
+ ENV PYTHONFAULTHANDLER 1
+ ENV PYTHONUNBUFFERED 1
+
+ # Create a non-root user and switch to it [1].
+ # [1] https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user
+ ARG UID=1000
+ ARG GID=$UID
+ RUN groupadd --gid $GID user && \
+     useradd --create-home --gid $GID --uid $UID user --no-log-init && \
+     chown user /opt/
+ USER user
+
+ # Create and activate a virtual environment.
+ ENV VIRTUAL_ENV /opt/raglite-env
+ ENV PATH $VIRTUAL_ENV/bin:$PATH
+ RUN python -m venv $VIRTUAL_ENV
+
+ # Set the working directory.
+ WORKDIR /workspaces/raglite/
+
+
+
+ FROM base as poetry
+
+ USER root
+
+ # Install Poetry in a separate venv so it doesn't pollute the main venv.
+ ENV POETRY_VERSION 1.8.0
+ ENV POETRY_VIRTUAL_ENV /opt/poetry-env
+ RUN --mount=type=cache,target=/root/.cache/pip/ \
+     python -m venv $POETRY_VIRTUAL_ENV && \
+     $POETRY_VIRTUAL_ENV/bin/pip install poetry~=$POETRY_VERSION && \
+     ln -s $POETRY_VIRTUAL_ENV/bin/poetry /usr/local/bin/poetry
+
+ # Install compilers that may be required for certain packages or platforms.
+ RUN --mount=type=cache,target=/var/cache/apt/ \
+     --mount=type=cache,target=/var/lib/apt/ \
+     apt-get update && \
+     apt-get install --no-install-recommends --yes build-essential
+
+ USER user
+
+ # Install the run time Python dependencies in the virtual environment.
+ COPY --chown=user:user poetry.lock* pyproject.toml /workspaces/raglite/
+ RUN mkdir -p /home/user/.cache/pypoetry/ && mkdir -p /home/user/.config/pypoetry/ && \
+     mkdir -p src/raglite/ && touch src/raglite/__init__.py && touch README.md
+ RUN --mount=type=cache,uid=$UID,gid=$GID,target=/home/user/.cache/pypoetry/ \
+     poetry install --only main --all-extras --no-interaction
+
+
+
+ FROM poetry as dev
+
+ # Install development tools: curl, git, gpg, ssh, starship, sudo, vim, and zsh.
+ USER root
+ RUN --mount=type=cache,target=/var/cache/apt/ \
+     --mount=type=cache,target=/var/lib/apt/ \
+     apt-get update && \
+     apt-get install --no-install-recommends --yes curl git gnupg ssh sudo vim zsh && \
+     sh -c "$(curl -fsSL https://starship.rs/install.sh)" -- "--yes" && \
+     usermod --shell /usr/bin/zsh user && \
+     echo 'user ALL=(root) NOPASSWD:ALL' > /etc/sudoers.d/user && chmod 0440 /etc/sudoers.d/user
+ RUN git config --system --add safe.directory '*'
+ USER user
+
+ # Install the development Python dependencies in the virtual environment.
+ RUN --mount=type=cache,uid=$UID,gid=$GID,target=/home/user/.cache/pypoetry/ \
+     poetry install --all-extras --no-interaction
+
+ # Persist output generated during docker build so that we can restore it in the dev container.
+ COPY --chown=user:user .pre-commit-config.yaml /workspaces/raglite/
+ RUN mkdir -p /opt/build/poetry/ && cp poetry.lock /opt/build/poetry/ && \
+     git init && pre-commit install --install-hooks && \
+     mkdir -p /opt/build/git/ && cp .git/hooks/commit-msg .git/hooks/pre-commit /opt/build/git/
+
+ # Configure the non-root user's shell.
+ ENV ANTIDOTE_VERSION 1.8.6
+ RUN git clone --branch v$ANTIDOTE_VERSION --depth=1 https://github.com/mattmc3/antidote.git ~/.antidote/ && \
+     echo 'zsh-users/zsh-syntax-highlighting' >> ~/.zsh_plugins.txt && \
+     echo 'zsh-users/zsh-autosuggestions' >> ~/.zsh_plugins.txt && \
+     echo 'source ~/.antidote/antidote.zsh' >> ~/.zshrc && \
+     echo 'antidote load' >> ~/.zshrc && \
+     echo 'eval "$(starship init zsh)"' >> ~/.zshrc && \
+     echo 'HISTFILE=~/.history/.zsh_history' >> ~/.zshrc && \
+     echo 'HISTSIZE=1000' >> ~/.zshrc && \
+     echo 'SAVEHIST=1000' >> ~/.zshrc && \
+     echo 'setopt share_history' >> ~/.zshrc && \
+     echo 'bindkey "^[[A" history-beginning-search-backward' >> ~/.zshrc && \
+     echo 'bindkey "^[[B" history-beginning-search-forward' >> ~/.zshrc && \
+     mkdir ~/.history/ && \
+     zsh -c 'source ~/.zshrc'
docker-compose.yml ADDED
@@ -0,0 +1,60 @@
+ version: "3.9"
+
+ services:
+
+   devcontainer:
+     build:
+       context: .
+       target: dev
+       args:
+         PYTHON_VERSION: ${PYTHON_VERSION:-3.10}
+         UID: ${UID:-1000}
+         GID: ${GID:-1000}
+     environment:
+       - OPENAI_API_KEY
+       - POETRY_PYPI_TOKEN_PYPI
+     depends_on:
+       - postgres
+     networks:
+       - raglite-network
+     volumes:
+       - ..:/workspaces
+       - command-history-volume:/home/user/.history/
+
+   dev:
+     extends: devcontainer
+     stdin_open: true
+     tty: true
+     entrypoint: []
+     command: [ "sh", "-c", "sudo chown user $$SSH_AUTH_SOCK && cp --update /opt/build/poetry/poetry.lock /workspaces/raglite/ && mkdir -p /workspaces/raglite/.git/hooks/ && cp --update /opt/build/git/* /workspaces/raglite/.git/hooks/ && zsh" ]
+     environment:
+       - OPENAI_API_KEY
+       - POETRY_PYPI_TOKEN_PYPI
+       - SSH_AUTH_SOCK=/run/host-services/ssh-auth.sock
+     depends_on:
+       - postgres
+     networks:
+       - raglite-network
+     volumes:
+       - ~/.gitconfig:/etc/gitconfig
+       - ~/.ssh/known_hosts:/home/user/.ssh/known_hosts
+       - ${SSH_AGENT_AUTH_SOCK:-/run/host-services/ssh-auth.sock}:/run/host-services/ssh-auth.sock
+     profiles:
+       - dev
+
+   postgres:
+     image: pgvector/pgvector:pg16
+     environment:
+       POSTGRES_USER: raglite_user
+       POSTGRES_PASSWORD: raglite_password
+     networks:
+       - raglite-network
+     tmpfs:
+       - /var/lib/postgresql/data
+
+ networks:
+   raglite-network:
+     driver: bridge
+
+ volumes:
+   command-history-volume:
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,186 @@
+ [build-system] # https://python-poetry.org/docs/pyproject/#poetry-and-pep-517
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.poetry] # https://python-poetry.org/docs/pyproject/
+ name = "raglite"
+ version = "0.2.0"
+ description = "A Python toolkit for Retrieval-Augmented Generation (RAG) with SQLite or PostgreSQL."
+ authors = ["Laurent Sorber <[email protected]>"]
+ readme = "README.md"
+ repository = "https://github.com/superlinear-ai/raglite"
+
+ [tool.commitizen] # https://commitizen-tools.github.io/commitizen/config/
+ bump_message = "bump(release): v$current_version → v$new_version"
+ tag_format = "v$version"
+ update_changelog_on_bump = true
+ version_provider = "poetry"
+
+ [tool.poetry.dependencies] # https://python-poetry.org/docs/dependency-specification/
+ # Python:
+ python = ">=3.10,<4.0"
+ # Markdown conversion:
+ pdftext = ">=0.3.13"
+ pypandoc-binary = { version = ">=1.13", optional = true }
+ scikit-learn = ">=1.4.2"
+ # Markdown formatting:
+ markdown-it-py = ">=3.0.0"
+ mdformat-gfm = ">=0.3.6"
+ # Sentence and chunk splitting:
+ numpy = ">=1.26.4"
+ scipy = ">=1.5.0"
+ spacy = ">=3.7.0,<3.8.0"
+ # Large Language Models:
+ huggingface-hub = ">=0.22.0"
+ litellm = ">=1.47.1"
+ llama-cpp-python = ">=0.2.88"
+ pydantic = ">=2.7.0"
+ # Approximate Nearest Neighbors:
+ pynndescent = ">=0.5.12"
+ # Reranking:
+ langdetect = ">=1.0.9"
+ rerankers = { extras = ["flashrank"], version = ">=0.5.3" }
+ # Storage:
+ pg8000 = ">=1.31.2"
+ sqlmodel-slim = ">=0.0.18"
+ # Progress:
+ tqdm = ">=4.66.0"
+ # Evaluation:
+ pandas = ">=2.1.0"
+ ragas = { version = ">=0.1.12", optional = true }
+ # CLI:
+ typer = ">=0.12.5"
+ # Frontend:
+ chainlit = { version = ">=1.2.0", optional = true }
+
+ [tool.poetry.extras] # https://python-poetry.org/docs/pyproject/#extras
+ chainlit = ["chainlit"]
+ pandoc = ["pypandoc-binary"]
+ ragas = ["ragas"]
+
+ [tool.poetry.group.test.dependencies] # https://python-poetry.org/docs/master/managing-dependencies/
+ commitizen = ">=3.29.1"
+ coverage = { extras = ["toml"], version = ">=7.4.4" }
+ mypy = ">=1.9.0"
+ poethepoet = ">=0.25.0"
+ pre-commit = ">=3.7.0"
+ pytest = ">=8.1.1"
+ pytest-mock = ">=3.14.0"
+ ruff = ">=0.5.7"
+ safety = ">=3.1.0"
+ shellcheck-py = ">=0.10.0.1"
+ typeguard = ">=4.2.1"
+ xx_sent_ud_sm = { url = "https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl" }
+
+ [tool.poetry.group.dev.dependencies] # https://python-poetry.org/docs/master/managing-dependencies/
+ cruft = ">=2.15.0"
+ ipykernel = ">=6.29.4"
+ ipython = ">=8.8.0"
+ ipywidgets = ">=8.1.2"
+ matplotlib = ">=3.9.0"
+ memory-profiler = ">=0.61.0"
+ pdoc = ">=14.4.0"
+
+ [tool.poetry.scripts] # https://python-poetry.org/docs/pyproject/#scripts
+ raglite = "raglite:cli"
+
+ [tool.coverage.report] # https://coverage.readthedocs.io/en/latest/config.html#report
+ fail_under = 50
+ precision = 1
+ show_missing = true
+ skip_covered = true
+
+ [tool.coverage.run] # https://coverage.readthedocs.io/en/latest/config.html#run
+ branch = true
+ command_line = "--module pytest"
+ data_file = "reports/.coverage"
+ source = ["src"]
+
+ [tool.coverage.xml] # https://coverage.readthedocs.io/en/latest/config.html#xml
+ output = "reports/coverage.xml"
+
+ [tool.mypy] # https://mypy.readthedocs.io/en/latest/config_file.html
+ junit_xml = "reports/mypy.xml"
+ strict = true
+ disallow_subclassing_any = false
+ disallow_untyped_decorators = false
+ ignore_missing_imports = true
+ pretty = true
+ show_column_numbers = true
+ show_error_codes = true
+ show_error_context = true
+ warn_unreachable = true
+
+ [tool.pytest.ini_options] # https://docs.pytest.org/en/latest/reference/reference.html#ini-options-ref
+ addopts = "--color=yes --exitfirst --failed-first --strict-config --strict-markers --verbosity=2 --junitxml=reports/pytest.xml"
+ filterwarnings = ["error", "ignore::DeprecationWarning", "ignore::pytest.PytestUnraisableExceptionWarning"]
+ testpaths = ["src", "tests"]
+ xfail_strict = true
+
+ [tool.ruff] # https://github.com/charliermarsh/ruff
+ fix = true
+ line-length = 100
+ src = ["src", "tests"]
+ target-version = "py310"
+
+ [tool.ruff.lint]
+ select = ["A", "ASYNC", "B", "BLE", "C4", "C90", "D", "DTZ", "E", "EM", "ERA", "F", "FBT", "FLY", "FURB", "G", "I", "ICN", "INP", "INT", "ISC", "LOG", "N", "NPY", "PERF", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "Q", "RET", "RSE", "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TCH", "TID", "TRY", "UP", "W", "YTT"]
+ ignore = ["D203", "D213", "E501", "RET504", "RUF002", "S101", "S307"]
+ unfixable = ["ERA001", "F401", "F841", "T201", "T203"]
+
+ [tool.ruff.lint.flake8-tidy-imports]
+ ban-relative-imports = "all"
+
+ [tool.ruff.lint.pycodestyle]
+ max-doc-length = 100
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "numpy"
+
+ [tool.poe.tasks] # https://github.com/nat-n/poethepoet
+
+ [tool.poe.tasks.docs]
+ help = "Generate this package's docs"
+ cmd = """
+     pdoc
+       --docformat $docformat
+       --output-directory $outputdirectory
+       raglite
+     """
+
+ [[tool.poe.tasks.docs.args]]
+ help = "The docstring style (default: numpy)"
+ name = "docformat"
+ options = ["--docformat"]
+ default = "numpy"
+
+ [[tool.poe.tasks.docs.args]]
+ help = "The output directory (default: docs)"
+ name = "outputdirectory"
+ options = ["--output-directory"]
+ default = "docs"
+
+ [tool.poe.tasks.lint]
+ help = "Lint this package"
+
+ [[tool.poe.tasks.lint.sequence]]
+ cmd = """
+     pre-commit run
+       --all-files
+       --color always
+     """
+
+ [[tool.poe.tasks.lint.sequence]]
+ shell = "safety check --continue-on-error --full-report"
+
+ [tool.poe.tasks.test]
+ help = "Test this package"
+
+ [[tool.poe.tasks.test.sequence]]
+ cmd = "coverage run"
+
+ [[tool.poe.tasks.test.sequence]]
+ cmd = "coverage report"
+
+ [[tool.poe.tasks.test.sequence]]
+ cmd = "coverage xml"
src/raglite/__init__.py ADDED
@@ -0,0 +1,41 @@
+ """RAGLite."""
+
+ from raglite._cli import cli
+ from raglite._config import RAGLiteConfig
+ from raglite._eval import answer_evals, evaluate, insert_evals
+ from raglite._insert import insert_document
+ from raglite._query_adapter import update_query_adapter
+ from raglite._rag import async_rag, rag
+ from raglite._search import (
+     hybrid_search,
+     keyword_search,
+     rerank_chunks,
+     retrieve_chunks,
+     retrieve_segments,
+     vector_search,
+ )
+
+ __all__ = [
+     # Config
+     "RAGLiteConfig",
+     # Insert
+     "insert_document",
+     # Search
+     "hybrid_search",
+     "keyword_search",
+     "vector_search",
+     "retrieve_chunks",
+     "retrieve_segments",
+     "rerank_chunks",
+     # RAG
+     "async_rag",
+     "rag",
+     # Query adapter
+     "update_query_adapter",
+     # Evaluate
+     "insert_evals",
+     "answer_evals",
+     "evaluate",
+     # CLI
+     "cli",
+ ]
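The exports above lend themselves to a short end-to-end sketch. The snippet below is illustrative only: the document path is hypothetical, and the keyword arguments of `rag` are an assumption inferred from how `async_rag` is called in src/raglite/_chainlit.py; `hybrid_search` and `retrieve_chunks` are used exactly as they are in that frontend.

# Illustrative usage sketch of the public API (assumptions noted above).
from pathlib import Path

from raglite import RAGLiteConfig, hybrid_search, insert_document, rag, retrieve_chunks

config = RAGLiteConfig(db_url="sqlite:///raglite.sqlite")  # Default SQLite database.
insert_document(Path("manual.pdf"), config=config)  # Hypothetical document path.
chunk_ids, _ = hybrid_search(query="How is the database configured?", num_results=10, config=config)
chunks = retrieve_chunks(chunk_ids=chunk_ids, config=config)
for token in rag(prompt="How is the database configured?", search=chunks, config=config):
    print(token, end="")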
src/raglite/_chainlit.py ADDED
@@ -0,0 +1,117 @@
+ """Chainlit frontend for RAGLite."""
+
+ import os
+ from pathlib import Path
+
+ import chainlit as cl
+ from chainlit.input_widget import Switch, TextInput
+
+ from raglite import (
+     RAGLiteConfig,
+     async_rag,
+     hybrid_search,
+     insert_document,
+     rerank_chunks,
+     retrieve_chunks,
+ )
+ from raglite._markdown import document_to_markdown
+
+ async_insert_document = cl.make_async(insert_document)
+ async_hybrid_search = cl.make_async(hybrid_search)
+ async_retrieve_chunks = cl.make_async(retrieve_chunks)
+ async_rerank_chunks = cl.make_async(rerank_chunks)
+
+
+ @cl.on_chat_start
+ async def start_chat() -> None:
+     """Initialize the chat."""
+     # Disable tokenizers parallelism to avoid the deadlock warning.
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+     # Add Chainlit settings with which the user can configure the RAGLite config.
+     default_config = RAGLiteConfig()
+     config = RAGLiteConfig(
+         db_url=os.environ.get("RAGLITE_DB_URL", default_config.db_url),
+         llm=os.environ.get("RAGLITE_LLM", default_config.llm),
+         embedder=os.environ.get("RAGLITE_EMBEDDER", default_config.embedder),
+     )
+     settings = await cl.ChatSettings(  # type: ignore[no-untyped-call]
+         [
+             TextInput(id="db_url", label="Database URL", initial=str(config.db_url)),
+             TextInput(id="llm", label="LLM", initial=config.llm),
+             TextInput(id="embedder", label="Embedder", initial=config.embedder),
+             Switch(id="vector_search_query_adapter", label="Query adapter", initial=True),
+         ]
+     ).send()
+     await update_config(settings)
+
+
+ @cl.on_settings_update  # type: ignore[arg-type]
+ async def update_config(settings: cl.ChatSettings) -> None:
+     """Update the RAGLite config."""
+     # Update the RAGLite config given the Chainlit settings.
+     config = RAGLiteConfig(
+         db_url=settings["db_url"],  # type: ignore[index]
+         llm=settings["llm"],  # type: ignore[index]
+         embedder=settings["embedder"],  # type: ignore[index]
+         vector_search_query_adapter=settings["vector_search_query_adapter"],  # type: ignore[index]
+     )
+     cl.user_session.set("config", config)  # type: ignore[no-untyped-call]
+     # Run a search to prime the pipeline if it's a local pipeline.
+     # TODO: Don't do this for SQLite once we switch from PyNNDescent to sqlite-vec.
+     if str(config.db_url).startswith("sqlite") or config.embedder.startswith("llama-cpp-python"):
+         # async with cl.Step(name="initialize", type="retrieval"):
+         query = "Hello world"
+         chunk_ids, _ = await async_hybrid_search(query=query, config=config)
+         _ = await async_rerank_chunks(query=query, chunk_ids=chunk_ids, config=config)
+
+
+ @cl.on_message
+ async def handle_message(user_message: cl.Message) -> None:
+     """Respond to a user message."""
+     # Get the config and message history from the user session.
+     config: RAGLiteConfig = cl.user_session.get("config")  # type: ignore[no-untyped-call]
+     # Determine what to do with the attachments.
+     inline_attachments = []
+     for file in user_message.elements:
+         if file.path:
+             doc_md = document_to_markdown(Path(file.path))
+             if len(doc_md) // 3 <= 5 * (config.chunk_max_size // 3):
+                 # Document is small enough to attach to the context.
+                 inline_attachments.append(f"{Path(file.path).name}:\n\n{doc_md}")
+             else:
+                 # Document is too large and must be inserted into the database.
+                 async with cl.Step(name="insert", type="run") as step:
+                     step.input = Path(file.path).name
+                     await async_insert_document(Path(file.path), config=config)
+     # Append any inline attachments to the user prompt.
+     user_prompt = f"{user_message.content}\n\n" + "\n\n".join(
+         f'<attachment index="{i}">\n{attachment.strip()}\n</attachment>'
+         for i, attachment in enumerate(inline_attachments)
+     )
+     # Search for relevant contexts for RAG.
+     async with cl.Step(name="search", type="retrieval") as step:
+         step.input = user_message.content
+         chunk_ids, _ = await async_hybrid_search(query=user_prompt, num_results=10, config=config)
+         chunks = await async_retrieve_chunks(chunk_ids=chunk_ids, config=config)
+         step.output = chunks
+         step.elements = [  # Show the top 3 chunks inline.
+             cl.Text(content=str(chunk), display="inline") for chunk in chunks[:3]
+         ]
+     # Rerank the chunks.
+     async with cl.Step(name="rerank", type="rerank") as step:
+         step.input = chunks
+         chunks = await async_rerank_chunks(query=user_prompt, chunk_ids=chunks, config=config)
+         step.output = chunks
+         step.elements = [  # Show the top 3 chunks inline.
+             cl.Text(content=str(chunk), display="inline") for chunk in chunks[:3]
+         ]
+     # Stream the LLM response.
+     assistant_message = cl.Message(content="")
+     async for token in async_rag(
+         prompt=user_prompt,
+         search=chunks,
+         messages=cl.chat_context.to_openai()[-5:],  # type: ignore[no-untyped-call]
+         config=config,
+     ):
+         await assistant_message.stream_token(token)
+     await assistant_message.update()  # type: ignore[no-untyped-call]
src/raglite/_cli.py ADDED
@@ -0,0 +1,39 @@
+ """RAGLite CLI."""
+
+ import os
+
+ import typer
+
+ from raglite._config import RAGLiteConfig
+
+ cli = typer.Typer()
+
+
+ @cli.callback()
+ def main() -> None:
+     """RAGLite CLI."""
+
+
+ @cli.command()
+ def chainlit(
+     db_url: str = typer.Option(RAGLiteConfig().db_url, help="Database URL"),
+     llm: str = typer.Option(RAGLiteConfig().llm, help="LiteLLM LLM"),
+     embedder: str = typer.Option(RAGLiteConfig().embedder, help="LiteLLM embedder"),
+ ) -> None:
+     """Serve a Chainlit frontend."""
+     # Set the environment variables for the Chainlit frontend.
+     os.environ["RAGLITE_DB_URL"] = os.environ.get("RAGLITE_DB_URL", db_url)
+     os.environ["RAGLITE_LLM"] = os.environ.get("RAGLITE_LLM", llm)
+     os.environ["RAGLITE_EMBEDDER"] = os.environ.get("RAGLITE_EMBEDDER", embedder)
+     # Import Chainlit here as it's an optional dependency.
+     try:
+         from chainlit.cli import run_chainlit
+     except ImportError as error:
+         error_message = "To serve a Chainlit frontend, please install the `chainlit` extra."
+         raise ImportError(error_message) from error
+     # Serve the frontend.
+     run_chainlit(__file__.replace("_cli.py", "_chainlit.py"))
+
+
+ if __name__ == "__main__":
+     cli()
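Because `cli` is a plain Typer app (exposed as the `raglite` script in pyproject.toml), it can also be exercised in-process. A small sketch, assuming Typer's bundled test runner and the `--db-url` option name that Typer derives from the `db_url` parameter above:

# Sketch: drive the Typer CLI in-process and inspect the help of the chainlit command.
from typer.testing import CliRunner

from raglite import cli

runner = CliRunner()
result = runner.invoke(cli, ["chainlit", "--help"])
print(result.exit_code)  # 0 on success.
print(result.output)     # Shows --db-url, --llm, and --embedder options.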
src/raglite/_config.py ADDED
@@ -0,0 +1,61 @@
+ """RAGLite config."""
+
+ import contextlib
+ import os
+ from dataclasses import dataclass, field
+ from io import StringIO
+
+ from llama_cpp import llama_supports_gpu_offload
+ from sqlalchemy.engine import URL
+
+ from raglite._flashrank import PatchedFlashRankRanker as FlashRankRanker
+
+ # Suppress rerankers output on import until [1] is fixed.
+ # [1] https://github.com/AnswerDotAI/rerankers/issues/36
+ with contextlib.redirect_stdout(StringIO()):
+     from rerankers.models.ranker import BaseRanker
+
+
+ @dataclass(frozen=True)
+ class RAGLiteConfig:
+     """Configuration for RAGLite."""
+
+     # Database config.
+     db_url: str | URL = "sqlite:///raglite.sqlite"
+     # LLM config used for generation.
+     llm: str = field(
+         default_factory=lambda: (
+             "llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192"
+             if llama_supports_gpu_offload()
+             else "llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096"
+         )
+     )
+     llm_max_tries: int = 4
+     # Embedder config used for indexing.
+     embedder: str = field(
+         default_factory=lambda: (  # Nomic-embed may be better if only English is used.
+             "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"
+             if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4  # noqa: PLR2004
+             else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"
+         )
+     )
+     embedder_normalize: bool = True
+     embedder_sentence_window_size: int = 3
+     # Chunk config used to partition documents into chunks.
+     chunk_max_size: int = 1440  # Max number of characters per chunk.
+     # Vector search config.
+     vector_search_index_metric: str = "cosine"  # The query adapter supports "dot" and "cosine".
+     vector_search_query_adapter: bool = True
+     # Reranking config.
+     reranker: BaseRanker | tuple[tuple[str, BaseRanker], ...] | None = field(
+         default_factory=lambda: (
+             ("en", FlashRankRanker("ms-marco-MiniLM-L-12-v2", verbose=0)),
+             ("other", FlashRankRanker("ms-marco-MultiBERT-L-12", verbose=0)),
+         ),
+         compare=False,  # Exclude the reranker from comparison to avoid lru_cache misses.
+     )
+
+     def __post_init__(self) -> None:
+         # Late chunking with llama-cpp-python does not apply sentence windowing.
+         if self.embedder.startswith("llama-cpp-python"):
+             object.__setattr__(self, "embedder_sentence_window_size", 1)
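Because `RAGLiteConfig` is a frozen dataclass with llama-cpp-python defaults, moving to a hosted stack is a matter of overriding fields. A sketch, assuming LiteLLM model identifiers and the PostgreSQL service from docker-compose.yml; the model names and the target database name are examples, not defaults of this commit:

# Sketch: override the local defaults with a PostgreSQL URL and LiteLLM model names.
from raglite import RAGLiteConfig

my_config = RAGLiteConfig(
    # Credentials and hostname match the postgres service in docker-compose.yml;
    # the database name "postgres" is an assumption.
    db_url="postgresql://raglite_user:raglite_password@postgres:5432/postgres",
    llm="gpt-4o-mini",                    # Example LiteLLM model name.
    embedder="text-embedding-3-small",    # Example LiteLLM embedder name.
)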
src/raglite/_database.py ADDED
@@ -0,0 +1,341 @@
+ """PostgreSQL or SQLite database tables for RAGLite."""
+
+ import datetime
+ import json
+ from functools import lru_cache
+ from hashlib import sha256
+ from pathlib import Path
+ from typing import Any
+
+ import numpy as np
+ from litellm import get_model_info  # type: ignore[attr-defined]
+ from markdown_it import MarkdownIt
+ from pydantic import ConfigDict
+ from sqlalchemy.engine import Engine, make_url
+ from sqlmodel import (
+     JSON,
+     Column,
+     Field,
+     Relationship,
+     Session,
+     SQLModel,
+     create_engine,
+     text,
+ )
+
+ from raglite._config import RAGLiteConfig
+ from raglite._litellm import LlamaCppPythonLLM
+ from raglite._typing import Embedding, FloatMatrix, FloatVector, PickledObject
+
+
+ def hash_bytes(data: bytes, max_len: int = 16) -> str:
+     """Hash bytes to a hexadecimal string."""
+     return sha256(data, usedforsecurity=False).hexdigest()[:max_len]
+
+
+ class Document(SQLModel, table=True):
+     """A document."""
+
+     # Enable JSON columns.
+     model_config = ConfigDict(arbitrary_types_allowed=True)  # type: ignore[assignment]
+
+     # Table columns.
+     id: str = Field(..., primary_key=True)
+     filename: str
+     url: str | None = Field(default=None)
+     metadata_: dict[str, Any] = Field(default_factory=dict, sa_column=Column("metadata", JSON))
+
+     # Add relationships so we can access document.chunks and document.evals.
+     chunks: list["Chunk"] = Relationship(back_populates="document")
+     evals: list["Eval"] = Relationship(back_populates="document")
+
+     @staticmethod
+     def from_path(doc_path: Path, **kwargs: Any) -> "Document":
+         """Create a document from a file path."""
+         return Document(
+             id=hash_bytes(doc_path.read_bytes()),
+             filename=doc_path.name,
+             metadata_={
+                 "size": doc_path.stat().st_size,
+                 "created": doc_path.stat().st_ctime,
+                 "modified": doc_path.stat().st_mtime,
+                 **kwargs,
+             },
+         )
+
+
+ class Chunk(SQLModel, table=True):
+     """A document chunk."""
+
+     # Enable JSON columns.
+     model_config = ConfigDict(arbitrary_types_allowed=True)  # type: ignore[assignment]
+
+     # Table columns.
+     id: str = Field(..., primary_key=True)
+     document_id: str = Field(..., foreign_key="document.id", index=True)
+     index: int = Field(..., index=True)
+     headings: str
+     body: str
+     metadata_: dict[str, Any] = Field(default_factory=dict, sa_column=Column("metadata", JSON))
+
+     # Add relationships so we can access chunk.document and chunk.embeddings.
+     document: Document = Relationship(back_populates="chunks")
+     embeddings: list["ChunkEmbedding"] = Relationship(back_populates="chunk")
+
+     @staticmethod
+     def from_body(
+         document_id: str,
+         index: int,
+         body: str,
+         headings: str = "",
+         **kwargs: Any,
+     ) -> "Chunk":
+         """Create a chunk from Markdown."""
+         return Chunk(
+             id=hash_bytes(body.encode()),
+             document_id=document_id,
+             index=index,
+             headings=headings,
+             body=body,
+             metadata_=kwargs,
+         )
+
+     def extract_headings(self) -> str:
+         """Extract Markdown headings from the chunk, starting from the current Markdown headings."""
+         md = MarkdownIt()
+         heading_lines = [""] * 10
+         level = None
+         for doc in (self.headings, self.body):
+             for token in md.parse(doc):
+                 if token.type == "heading_open":
+                     level = int(token.tag[1])
+                 elif token.type == "heading_close":
+                     level = None
+                 elif level is not None:
+                     heading_content = token.content.strip().replace("\n", " ")
+                     heading_lines[level] = ("#" * level) + " " + heading_content
+                     heading_lines[level + 1 :] = [""] * len(heading_lines[level + 1 :])
+         headings = "\n".join([heading for heading in heading_lines if heading])
+         return headings
+
+     @property
+     def embedding_matrix(self) -> FloatMatrix:
+         """Return this chunk's multi-vector embedding matrix."""
+         # Uses the relationship chunk.embeddings to access the chunk_embedding table.
+         return np.vstack([embedding.embedding[np.newaxis, :] for embedding in self.embeddings])
+
+     def __hash__(self) -> int:
+         return hash(self.id)
+
+     def __repr__(self) -> str:
+         return json.dumps(
+             {
+                 "id": self.id,
+                 "document_id": self.document_id,
+                 "index": self.index,
+                 "headings": self.headings,
+                 "body": self.body[:100],
+                 "metadata": self.metadata_,
+             },
+             indent=4,
+         )
+
+     def __str__(self) -> str:
+         """Context representation of this chunk."""
+         return f"{self.headings.strip()}\n\n{self.body.strip()}".strip()
+
+
+ class ChunkEmbedding(SQLModel, table=True):
+     """A (sub-)chunk embedding."""
+
+     __tablename__ = "chunk_embedding"
+
+     # Enable Embedding columns.
+     model_config = ConfigDict(arbitrary_types_allowed=True)  # type: ignore[assignment]
+
+     # Table columns.
+     id: int = Field(..., primary_key=True)
+     chunk_id: str = Field(..., foreign_key="chunk.id", index=True)
+     embedding: FloatVector = Field(..., sa_column=Column(Embedding(dim=-1)))
+
+     # Add relationship so we can access embedding.chunk.
+     chunk: Chunk = Relationship(back_populates="embeddings")
+
+     @classmethod
+     def set_embedding_dim(cls, dim: int) -> None:
+         """Modify the embedding column's dimension after class definition."""
+         cls.__table__.c["embedding"].type.dim = dim  # type: ignore[attr-defined]
+
+
+ class IndexMetadata(SQLModel, table=True):
+     """Vector and keyword search index metadata."""
+
+     __tablename__ = "index_metadata"
+
+     # Enable PickledObject columns.
+     model_config = ConfigDict(arbitrary_types_allowed=True)  # type: ignore[assignment]
+
+     # Table columns.
+     id: str = Field(..., primary_key=True)
+     version: datetime.datetime = Field(
+         default_factory=lambda: datetime.datetime.now(datetime.timezone.utc)
+     )
+     metadata_: dict[str, Any] = Field(
+         default_factory=dict, sa_column=Column("metadata", PickledObject)
+     )
+
+     @staticmethod
+     @lru_cache(maxsize=4)
+     def _get(id_: str, *, config: RAGLiteConfig | None = None) -> dict[str, Any] | None:
+         engine = create_database_engine(config)
+         with Session(engine) as session:
+             index_metadata_record = session.get(IndexMetadata, id_)
+             if index_metadata_record is None:
+                 return None
+             return index_metadata_record.metadata_
+
+     @staticmethod
+     def get(id_: str = "default", *, config: RAGLiteConfig | None = None) -> dict[str, Any]:
+         metadata = IndexMetadata._get(id_, config=config) or {}
+         return metadata
+
+
+ class Eval(SQLModel, table=True):
+     """A RAG evaluation example."""
+
+     __tablename__ = "eval"
+
+     # Enable JSON columns.
+     model_config = ConfigDict(arbitrary_types_allowed=True)  # type: ignore[assignment]
+
+     # Table columns.
+     id: str = Field(..., primary_key=True)
+     document_id: str = Field(..., foreign_key="document.id", index=True)
+     chunk_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
+     question: str
+     contexts: list[str] = Field(default_factory=list, sa_column=Column(JSON))
+     ground_truth: str
+     metadata_: dict[str, Any] = Field(default_factory=dict, sa_column=Column("metadata", JSON))
+
+     # Add relationship so we can access eval.document.
+     document: Document = Relationship(back_populates="evals")
+
+     @staticmethod
+     def from_chunks(
+         question: str,
+         contexts: list[Chunk],
+         ground_truth: str,
+         **kwargs: Any,
+     ) -> "Eval":
+         """Create an eval from a question, its context chunks, and a ground truth answer."""
+         document_id = contexts[0].document_id
+         chunk_ids = [context.id for context in contexts]
+         return Eval(
+             id=hash_bytes(f"{document_id}-{chunk_ids}-{question}".encode()),
+             document_id=document_id,
+             chunk_ids=chunk_ids,
+             question=question,
+             contexts=[str(context) for context in contexts],
+             ground_truth=ground_truth,
+             metadata_=kwargs,
+         )
+
+
+ @lru_cache(maxsize=1)
+ def create_database_engine(config: RAGLiteConfig | None = None) -> Engine:
+     """Create a database engine and initialize it."""
+     # Parse the database URL and validate that the database backend is supported.
+     config = config or RAGLiteConfig()
+     db_url = make_url(config.db_url)
+     db_backend = db_url.get_backend_name()
+     # Update database configuration.
+     connect_args = {}
+     if db_backend == "postgresql":
+         # Select the pg8000 driver if not set (psycopg2 is the default), and prefer SSL.
+         if "+" not in db_url.drivername:
+             db_url = db_url.set(drivername="postgresql+pg8000")
+         # Support setting the sslmode for pg8000.
+         if "pg8000" in db_url.drivername and "sslmode" in db_url.query:
+             query = dict(db_url.query)
+             if query.pop("sslmode") != "disable":
+                 connect_args["ssl_context"] = True
+             db_url = db_url.set(query=query)
+     elif db_backend == "sqlite":
+         # Optimize SQLite performance.
+         pragmas = {"journal_mode": "WAL", "synchronous": "NORMAL"}
+         db_url = db_url.update_query_dict(pragmas, append=True)
+     else:
+         error_message = "RAGLite only supports PostgreSQL and SQLite."
+         raise ValueError(error_message)
+     # Create the engine.
+     engine = create_engine(db_url, pool_pre_ping=True, connect_args=connect_args)
+     # Install database extensions.
+     if db_backend == "postgresql":
+         with Session(engine) as session:
+             session.execute(text("CREATE EXTENSION IF NOT EXISTS vector;"))
+             session.commit()
+     # If the user has configured a llama-cpp-python model, we ensure that LiteLLM's model info is up
+     # to date by loading that LLM.
+     if config.embedder.startswith("llama-cpp-python"):
+         _ = LlamaCppPythonLLM.llm(config.embedder, embedding=True)
+     llm_provider = "llama-cpp-python" if config.embedder.startswith("llama-cpp") else None
+     model_info = get_model_info(config.embedder, custom_llm_provider=llm_provider)
+     embedding_dim = model_info.get("output_vector_size") or -1
+     assert embedding_dim > 0
+     # Create all SQLModel tables.
+     ChunkEmbedding.set_embedding_dim(embedding_dim)
+     SQLModel.metadata.create_all(engine)
+     # Create backend-specific indexes.
+     if db_backend == "postgresql":
+         # Create a keyword search index with `tsvector` and a vector search index with `pgvector`.
+         with Session(engine) as session:
+             metrics = {"cosine": "cosine", "dot": "ip", "euclidean": "l2", "l1": "l1", "l2": "l2"}
+             session.execute(
+                 text("""
+                     CREATE INDEX IF NOT EXISTS keyword_search_chunk_index ON chunk USING GIN (to_tsvector('simple', body));
+                     """)
+             )
+             session.execute(
+                 text(f"""
+                     CREATE INDEX IF NOT EXISTS vector_search_chunk_index ON chunk_embedding
+                     USING hnsw (
+                         (embedding::halfvec({embedding_dim}))
+                         halfvec_{metrics[config.vector_search_index_metric]}_ops
+                     );
+                     """)
+             )
+             session.commit()
+     elif db_backend == "sqlite":
+         # Create a virtual table for keyword search on the chunk table.
+         # We use the chunk table as an external content table [1] to avoid duplicating the data.
+         # [1] https://www.sqlite.org/fts5.html#external_content_tables
+         with Session(engine) as session:
+             session.execute(
+                 text("""
+                     CREATE VIRTUAL TABLE IF NOT EXISTS keyword_search_chunk_index USING fts5(body, content='chunk', content_rowid='rowid');
+                     """)
+             )
+             session.execute(
+                 text("""
+                     CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_insert AFTER INSERT ON chunk BEGIN
+                         INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
+                     END;
+                     """)
+             )
+             session.execute(
+                 text("""
+                     CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_delete AFTER DELETE ON chunk BEGIN
+                         INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
+                     END;
+                     """)
+             )
+             session.execute(
+                 text("""
+                     CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_update AFTER UPDATE ON chunk BEGIN
+                         INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
+                         INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
+                     END;
+                     """)
+             )
+             session.commit()
+     return engine
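The table models and `create_database_engine` above can be combined to inspect what has been indexed. A short sketch, assuming documents were already inserted with `insert_document`:

# Sketch: open the configured database and list stored documents with their chunk counts.
from sqlmodel import Session, func, select

from raglite._config import RAGLiteConfig
from raglite._database import Chunk, Document, create_database_engine

engine = create_database_engine(RAGLiteConfig())
with Session(engine) as session:
    for document in session.exec(select(Document)).all():
        num_chunks = session.exec(
            select(func.count(Chunk.id)).where(Chunk.document_id == document.id)
        ).one()
        print(document.filename, num_chunks)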
src/raglite/_embed.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """String embedder."""
2
+
3
+ from functools import partial
4
+ from typing import Literal
5
+
6
+ import numpy as np
7
+ from litellm import embedding
8
+ from llama_cpp import LLAMA_POOLING_TYPE_NONE, Llama
9
+ from tqdm.auto import tqdm, trange
10
+
11
+ from raglite._config import RAGLiteConfig
12
+ from raglite._litellm import LlamaCppPythonLLM
13
+ from raglite._typing import FloatMatrix, IntVector
14
+
15
+
16
+ def _embed_sentences_with_late_chunking( # noqa: PLR0915
17
+ sentences: list[str], *, config: RAGLiteConfig | None = None
18
+ ) -> FloatMatrix:
19
+ """Embed a document's sentences with late chunking."""
20
+
21
+ def _count_tokens(
22
+ sentences: list[str], embedder: Llama, sentinel_char: str, sentinel_tokens: list[int]
23
+ ) -> list[int]:
24
+ # Join the sentences with the sentinel token and tokenise the result.
25
+ sentences_tokens = np.asarray(
26
+ embedder.tokenize(sentinel_char.join(sentences).encode(), add_bos=False), dtype=np.intp
27
+ )
28
+ # Map all sentinel token variants to the first one.
29
+ for sentinel_token in sentinel_tokens[1:]:
30
+ sentences_tokens[sentences_tokens == sentinel_token] = sentinel_tokens[0]
31
+ # Count how many tokens there are in between sentinel tokens to recover the token counts.
32
+ sentinel_indices = np.where(sentences_tokens == sentinel_tokens[0])[0]
33
+ num_tokens = np.diff(sentinel_indices, prepend=0, append=len(sentences_tokens))
34
+ assert len(num_tokens) == len(sentences), f"Sentinel `{sentinel_char}` appears in document"
35
+ num_tokens_list: list[int] = num_tokens.tolist()
36
+ return num_tokens_list
37
+
38
+ def _create_segment(
39
+ content_start_index: int,
40
+ max_tokens_preamble: int,
41
+ max_tokens_content: int,
42
+ num_tokens: IntVector,
43
+ ) -> tuple[int, int]:
44
+ # Compute the segment sentence start index so that the segment preamble has no more than
45
+ # max_tokens_preamble tokens between [segment_start_index, content_start_index).
46
+ cumsum_backwards = np.cumsum(num_tokens[:content_start_index][::-1])
47
+ offset_preamble = np.searchsorted(cumsum_backwards, max_tokens_preamble, side="right")
48
+ segment_start_index = content_start_index - int(offset_preamble)
49
+ # Allow a larger segment content if we didn't use all of the allowed preamble tokens.
50
+ max_tokens_content = max_tokens_content + (
51
+ max_tokens_preamble - np.sum(num_tokens[segment_start_index:content_start_index])
52
+ )
53
+ # Compute the segment sentence end index so that the segment content has no more than
54
+ # max_tokens_content tokens between [content_start_index, segment_end_index).
55
+ cumsum_forwards = np.cumsum(num_tokens[content_start_index:])
56
+ offset_segment = np.searchsorted(cumsum_forwards, max_tokens_content, side="right")
57
+ segment_end_index = content_start_index + int(offset_segment)
58
+ return segment_start_index, segment_end_index
59
+
60
+ # Assert that we're using a llama-cpp-python model, since API-based embedding models don't
61
+ # support outputting token-level embeddings.
62
+ config = config or RAGLiteConfig()
63
+ assert config.embedder.startswith("llama-cpp-python")
64
+ embedder = LlamaCppPythonLLM.llm(
65
+ config.embedder, embedding=True, pooling_type=LLAMA_POOLING_TYPE_NONE
66
+ )
67
+ n_ctx = embedder.n_ctx()
68
+ n_batch = embedder.n_batch
69
+ # Identify the tokens corresponding to a sentinel character.
70
+ sentinel_char = "⊕"
71
+ sentinel_test = f"A{sentinel_char}B {sentinel_char} C.\n{sentinel_char}D"
72
+ sentinel_tokens = [
73
+ token
74
+ for token in embedder.tokenize(sentinel_test.encode(), add_bos=False)
75
+ if sentinel_char in embedder.detokenize([token]).decode()
76
+ ]
77
+ assert len(sentinel_tokens), f"Sentinel `{sentinel_char}` not supported by embedder"
78
+ # Compute the number of tokens per sentence. We use a method based on a sentinel token to
79
+ # minimise the number of calls to embedder.tokenize, which incurs a significant overhead
80
+ # (presumably to load the tokenizer) [1].
81
+ # TODO: Make token counting faster and more robust once [1] is fixed.
82
+ # [1] https://github.com/abetlen/llama-cpp-python/issues/1763
83
+ num_tokens_list: list[int] = []
84
+ sentence_batch, sentence_batch_len = [], 0
85
+ for i, sentence in enumerate(sentences):
86
+ sentence_batch.append(sentence)
87
+ sentence_batch_len += len(sentence)
88
+ if i == len(sentences) - 1 or sentence_batch_len > (n_ctx // 2):
89
+ num_tokens_list.extend(
90
+ _count_tokens(sentence_batch, embedder, sentinel_char, sentinel_tokens)
91
+ )
92
+ sentence_batch, sentence_batch_len = [], 0
93
+ num_tokens = np.asarray(num_tokens_list, dtype=np.intp)
94
+ # Compute the maximum number of tokens for each segment's preamble and content.
95
+ # Unfortunately, llama-cpp-python truncates the input to n_batch tokens and crashes if you try
96
+ # to increase it [1]. Until this is fixed, we have to limit max_tokens to n_batch.
97
+ # TODO: Improve the context window size once [1] is fixed.
98
+ # [1] https://github.com/abetlen/llama-cpp-python/issues/1762
99
+ max_tokens = min(n_ctx, n_batch) - 16
100
+ max_tokens_preamble = round(0.382 * max_tokens) # Golden ratio.
101
+ max_tokens_content = max_tokens - max_tokens_preamble
102
+ # Compute a list of segments, each consisting of a preamble and content.
103
+ segments = []
104
+ content_start_index = 0
105
+ while content_start_index < len(sentences):
106
+ segment_start_index, segment_end_index = _create_segment(
107
+ content_start_index, max_tokens_preamble, max_tokens_content, num_tokens
108
+ )
109
+ segments.append((segment_start_index, content_start_index, segment_end_index))
110
+ content_start_index = segment_end_index
111
+ # Embed the segments and apply late chunking.
112
+ sentence_embeddings_list: list[FloatMatrix] = []
113
+ if len(segments) > 1 or segments[0][2] > 128: # noqa: PLR2004
114
+ segments = tqdm(segments, desc="Embedding", unit="segment", dynamic_ncols=True)
115
+ for segment in segments:
116
+ # Get the token embeddings of the entire segment, including preamble and content.
117
+ segment_start_index, content_start_index, segment_end_index = segment
118
+ segment_sentences = sentences[segment_start_index:segment_end_index]
119
+ segment_embedding = np.asarray(embedder.embed("".join(segment_sentences)))
120
+ # Split the segment embeddings into embedding matrices per sentence.
121
+ segment_tokens = num_tokens[segment_start_index:segment_end_index]
122
+ sentence_size = np.round(
123
+ len(segment_embedding) * (segment_tokens / np.sum(segment_tokens))
124
+ ).astype(np.intp)
125
+ sentence_matrices = np.split(segment_embedding, np.cumsum(sentence_size)[:-1])
126
+ # Compute the segment sentence embeddings by averaging the token embeddings.
127
+ content_sentence_embeddings = [
128
+ np.mean(sentence_matrix, axis=0, keepdims=True)
129
+ for sentence_matrix in sentence_matrices[content_start_index - segment_start_index :]
130
+ ]
131
+ sentence_embeddings_list.append(np.vstack(content_sentence_embeddings))
132
+ sentence_embeddings = np.vstack(sentence_embeddings_list)
133
+ # Normalise the sentence embeddings to unit norm and cast to half precision.
134
+ if config.embedder_normalize:
135
+ sentence_embeddings /= np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
136
+ sentence_embeddings = sentence_embeddings.astype(np.float16)
137
+ return sentence_embeddings
138
+
139
+
140
+ def _embed_sentences_with_windowing(
141
+ sentences: list[str], *, config: RAGLiteConfig | None = None
142
+ ) -> FloatMatrix:
143
+ """Embed a document's sentences with windowing."""
144
+
145
+ def _embed_string_batch(string_batch: list[str], *, config: RAGLiteConfig) -> FloatMatrix:
146
+ # Embed the batch of strings.
147
+ if config.embedder.startswith("llama-cpp-python"):
148
+ # LiteLLM doesn't yet support registering a custom embedder, so we handle it here.
149
+ # Additionally, we manually pool the token embeddings to obtain sentence
150
+ # embeddings because token embeddings are universally supported, while sequence
151
+ # embeddings are only supported by some models.
152
+ embedder = LlamaCppPythonLLM.llm(
153
+ config.embedder, embedding=True, pooling_type=LLAMA_POOLING_TYPE_NONE
154
+ )
155
+ embeddings = np.asarray([np.mean(row, axis=0) for row in embedder.embed(string_batch)])
156
+ else:
157
+ # Use LiteLLM's API to embed the batch of strings.
158
+ response = embedding(config.embedder, string_batch)
159
+ embeddings = np.asarray([item["embedding"] for item in response["data"]])
160
+ # Normalise the embeddings to unit norm and cast to half precision.
161
+ if config.embedder_normalize:
162
+ embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
163
+ embeddings = embeddings.astype(np.float16)
164
+ return embeddings
165
+
166
+ # Window the sentences with a lookback of `config.embedder_sentence_window_size - 1` sentences.
167
+ config = config or RAGLiteConfig()
168
+ sentence_windows = [
169
+ "".join(sentences[max(0, i - (config.embedder_sentence_window_size - 1)) : i + 1])
170
+ for i in range(len(sentences))
171
+ ]
172
+ # Embed the sentence windows in batches.
173
+ batch_size = 64
174
+ batch_range = (
175
+ partial(trange, desc="Embedding", unit="batch", dynamic_ncols=True)
176
+ if len(sentence_windows) > batch_size
177
+ else range
178
+ )
179
+ batch_embeddings = [
180
+ _embed_string_batch(sentence_windows[i : i + batch_size], config=config)
181
+ for i in batch_range(0, len(sentence_windows), batch_size) # type: ignore[operator]
182
+ ]
183
+ sentence_embeddings = np.vstack(batch_embeddings)
184
+ return sentence_embeddings
185
+
186
+
187
+ def sentence_embedding_type(
188
+ *,
189
+ config: RAGLiteConfig | None = None,
190
+ ) -> Literal["late_chunking", "windowing"]:
191
+ """Return the type of sentence embeddings."""
192
+ config = config or RAGLiteConfig()
193
+ return "late_chunking" if config.embedder.startswith("llama-cpp-python") else "windowing"
194
+
195
+
196
+ def embed_sentences(sentences: list[str], *, config: RAGLiteConfig | None = None) -> FloatMatrix:
197
+ """Embed the sentences of a document as a NumPy matrix with one row per sentence."""
198
+ config = config or RAGLiteConfig()
199
+ if sentence_embedding_type(config=config) == "late_chunking":
200
+ sentence_embeddings = _embed_sentences_with_late_chunking(sentences, config=config)
201
+ else:
202
+ sentence_embeddings = _embed_sentences_with_windowing(sentences, config=config)
203
+ return sentence_embeddings
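
A minimal usage sketch of `embed_sentences` with the default configuration (not part of the committed diff; the sample sentences are illustrative and the default embedder is downloaded on first use):

```python
from raglite._config import RAGLiteConfig
from raglite._embed import embed_sentences

config = RAGLiteConfig()  # Default llama-cpp-python embedder.
sentences = ["RAGLite is a RAG toolkit. ", "It works with SQLite or PostgreSQL. "]
embeddings = embed_sentences(sentences, config=config)
print(embeddings.shape, embeddings.dtype)  # One row per sentence, float16, unit norm if normalised.
```
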
src/raglite/_eval.py ADDED
@@ -0,0 +1,257 @@
1
+ """Generation and evaluation of evals."""
2
+
3
+ from random import randint
4
+ from typing import ClassVar
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pydantic import BaseModel, Field, field_validator
9
+ from sqlmodel import Session, func, select
10
+ from tqdm.auto import tqdm, trange
11
+
12
+ from raglite._config import RAGLiteConfig
13
+ from raglite._database import Chunk, Document, Eval, create_database_engine
14
+ from raglite._extract import extract_with_llm
15
+ from raglite._rag import rag
16
+ from raglite._search import hybrid_search, retrieve_segments, vector_search
17
+ from raglite._typing import SearchMethod
18
+
19
+
20
+ def insert_evals( # noqa: C901
21
+ *, num_evals: int = 100, max_contexts_per_eval: int = 20, config: RAGLiteConfig | None = None
22
+ ) -> None:
23
+ """Generate and insert evals into the database."""
24
+
25
+ class QuestionResponse(BaseModel):
26
+ """A specific question about the content of a set of document contexts."""
27
+
28
+ question: str = Field(
29
+ ...,
30
+ description="A specific question about the content of a set of document contexts.",
31
+ min_length=1,
32
+ )
33
+ system_prompt: ClassVar[str] = """
34
+ You are given a set of contexts extracted from a document.
35
+ You are a subject matter expert on the document's topic.
36
+ Your task is to generate a question to quiz other subject matter experts on the information in the provided context.
37
+ The question MUST satisfy ALL of the following criteria:
38
+ - The question SHOULD integrate as much of the provided context as possible.
39
+ - The question MUST NOT be a general or open question, but MUST instead be as specific to the provided context as possible.
40
+ - The question MUST be completely answerable using ONLY the information in the provided context, without depending on any background information.
41
+ - The question MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
42
+ - The question MUST NOT reference the existence of the context, directly or indirectly.
43
+ - The question MUST treat the context as if its contents are entirely part of your working memory.
44
+ """.strip()
45
+
46
+ @field_validator("question")
47
+ @classmethod
48
+ def validate_question(cls, value: str) -> str:
49
+ """Validate the question."""
50
+ question = value.strip().lower()
51
+ if "context" in question or "document" in question or "question" in question:
52
+ raise ValueError
53
+ if not question.endswith("?"):
54
+ raise ValueError
55
+ return value
56
+
57
+ config = config or RAGLiteConfig()
58
+ engine = create_database_engine(config)
59
+ with Session(engine) as session:
60
+ for _ in trange(num_evals, desc="Generating evals", unit="eval", dynamic_ncols=True):
61
+ # Sample a random document from the database.
62
+ seed_document = session.exec(select(Document).order_by(func.random()).limit(1)).first()
63
+ if seed_document is None:
64
+ error_message = "First run `insert_document()` before generating evals."
65
+ raise ValueError(error_message)
66
+ # Sample a random chunk from that document.
67
+ seed_chunk = session.exec(
68
+ select(Chunk)
69
+ .where(Chunk.document_id == seed_document.id)
70
+ .order_by(func.random())
71
+ .limit(1)
72
+ ).first()
73
+ if seed_chunk is None:
74
+ continue
75
+ # Expand the seed chunk into a set of related chunks.
76
+ related_chunk_ids, _ = vector_search(
77
+ np.mean(seed_chunk.embedding_matrix, axis=0, keepdims=True),
78
+ num_results=randint(2, max_contexts_per_eval // 2), # noqa: S311
79
+ config=config,
80
+ )
81
+ related_chunks = retrieve_segments(related_chunk_ids, config=config)
82
+ # Extract a question from the seed chunk's related chunks.
83
+ try:
84
+ question_response = extract_with_llm(
85
+ QuestionResponse, related_chunks, config=config
86
+ )
87
+ except ValueError:
88
+ continue
89
+ else:
90
+ question = question_response.question
91
+ # Search for candidate chunks to answer the generated question.
92
+ candidate_chunk_ids, _ = hybrid_search(
93
+ question, num_results=max_contexts_per_eval, config=config
94
+ )
95
+ candidate_chunks = [session.get(Chunk, chunk_id) for chunk_id in candidate_chunk_ids]
96
+
97
+ # Determine which candidate chunks are relevant to answer the generated question.
98
+ class ContextEvalResponse(BaseModel):
99
+ """Indicate whether the provided context can be used to answer a given question."""
100
+
101
+ hit: bool = Field(
102
+ ...,
103
+ description="True if the provided context contains (a part of) the answer to the given question, false otherwise.",
104
+ )
105
+ system_prompt: ClassVar[str] = f"""
106
+ You are given a context extracted from a document.
107
+ You are a subject matter expert on the document's topic.
108
+ Your task is to answer whether the provided context contains (a part of) the answer to this question: "{question}"
109
+ An example of a context that does NOT contain (a part of) the answer is a table of contents.
110
+ """.strip()
111
+
112
+ relevant_chunks = []
113
+ for candidate_chunk in tqdm(
114
+ candidate_chunks, desc="Evaluating chunks", unit="chunk", dynamic_ncols=True
115
+ ):
116
+ try:
117
+ context_eval_response = extract_with_llm(
118
+ ContextEvalResponse, str(candidate_chunk), config=config
119
+ )
120
+ except ValueError: # noqa: PERF203
121
+ pass
122
+ else:
123
+ if context_eval_response.hit:
124
+ relevant_chunks.append(candidate_chunk)
125
+ if not relevant_chunks:
126
+ continue
127
+
128
+ # Answer the question using the relevant chunks.
129
+ class AnswerResponse(BaseModel):
130
+ """Answer a question using the provided context."""
131
+
132
+ answer: str = Field(
133
+ ...,
134
+ description="A complete answer to the given question using the provided context.",
135
+ min_length=1,
136
+ )
137
+ system_prompt: ClassVar[str] = f"""
138
+ You are given a set of contexts extracted from a document.
139
+ You are a subject matter expert on the document's topic.
140
+ Your task is to generate a complete answer to the following question using the provided context: "{question}"
141
+ The answer MUST satisfy ALL of the following criteria:
142
+ - The answer MUST integrate as much of the provided context as possible.
143
+ - The answer MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
144
+ - The answer MUST NOT reference the existence of the context, directly or indirectly.
145
+ - The answer MUST treat the context as if its contents are entirely part of your working memory.
146
+ """.strip()
147
+
148
+ try:
149
+ answer_response = extract_with_llm(
150
+ AnswerResponse,
151
+ [str(relevant_chunk) for relevant_chunk in relevant_chunks],
152
+ config=config,
153
+ )
154
+ except ValueError:
155
+ continue
156
+ else:
157
+ answer = answer_response.answer
158
+ # Store the eval in the database.
159
+ eval_ = Eval.from_chunks(
160
+ question=question,
161
+ contexts=relevant_chunks,
162
+ ground_truth=answer,
163
+ )
164
+ session.add(eval_)
165
+ session.commit()
166
+
167
+
168
+ def answer_evals(
169
+ num_evals: int = 100,
170
+ search: SearchMethod = hybrid_search,
171
+ *,
172
+ config: RAGLiteConfig | None = None,
173
+ ) -> pd.DataFrame:
174
+ """Read evals from the database and answer them with RAG."""
175
+ # Read evals from the database.
176
+ config = config or RAGLiteConfig()
177
+ engine = create_database_engine(config)
178
+ with Session(engine) as session:
179
+ evals = session.exec(select(Eval).limit(num_evals)).all()
180
+ # Answer evals with RAG.
181
+ answers: list[str] = []
182
+ contexts: list[list[str]] = []
183
+ for eval_ in tqdm(evals, desc="Answering evals", unit="eval", dynamic_ncols=True):
184
+ response = rag(eval_.question, search=search, config=config)
185
+ answer = "".join(response)
186
+ answers.append(answer)
187
+ chunk_ids, _ = search(eval_.question, config=config)
188
+ contexts.append(retrieve_segments(chunk_ids, config=config))
189
+ # Collect the answered evals.
190
+ answered_evals: dict[str, list[str] | list[list[str]]] = {
191
+ "question": [eval_.question for eval_ in evals],
192
+ "answer": answers,
193
+ "contexts": contexts,
194
+ "ground_truth": [eval_.ground_truth for eval_ in evals],
195
+ "ground_truth_contexts": [eval_.contexts for eval_ in evals],
196
+ }
197
+ answered_evals_df = pd.DataFrame.from_dict(answered_evals)
198
+ return answered_evals_df
199
+
200
+
201
+ def evaluate(
202
+ answered_evals: pd.DataFrame | int = 100,
203
+ config: RAGLiteConfig | None = None,
204
+ ) -> pd.DataFrame:
205
+ """Evaluate the performance of a set of answered evals with Ragas."""
206
+ try:
207
+ from datasets import Dataset
208
+ from langchain_community.chat_models import ChatLiteLLM
209
+ from langchain_community.embeddings import LlamaCppEmbeddings
210
+ from langchain_community.llms import LlamaCpp
211
+ from ragas import RunConfig
212
+ from ragas import evaluate as ragas_evaluate
213
+
214
+ from raglite._litellm import LlamaCppPythonLLM
215
+ except ImportError as import_error:
216
+ error_message = "To use the `evaluate` function, please install the `ragas` extra."
217
+ raise ImportError(error_message) from import_error
218
+
219
+ # Create a set of answered evals if not provided.
220
+ config = config or RAGLiteConfig()
221
+ answered_evals_df = (
222
+ answered_evals
223
+ if isinstance(answered_evals, pd.DataFrame)
224
+ else answer_evals(num_evals=answered_evals, config=config)
225
+ )
226
+ # Load the LLM.
227
+ if config.llm.startswith("llama-cpp-python"):
228
+ llm = LlamaCppPythonLLM().llm(model=config.llm)
229
+ lc_llm = LlamaCpp(
230
+ model_path=llm.model_path,
231
+ n_batch=llm.n_batch,
232
+ n_ctx=llm.n_ctx(),
233
+ n_gpu_layers=-1,
234
+ verbose=llm.verbose,
235
+ )
236
+ else:
237
+ lc_llm = ChatLiteLLM(model=config.llm) # type: ignore[call-arg]
238
+ # Load the embedder.
239
+ if not config.embedder.startswith("llama-cpp-python"):
240
+ error_message = "Currently, only `llama-cpp-python` embedders are supported."
241
+ raise NotImplementedError(error_message)
242
+ embedder = LlamaCppPythonLLM().llm(model=config.embedder, embedding=True)
243
+ lc_embedder = LlamaCppEmbeddings( # type: ignore[call-arg]
244
+ model_path=embedder.model_path,
245
+ n_batch=embedder.n_batch,
246
+ n_ctx=embedder.n_ctx(),
247
+ n_gpu_layers=-1,
248
+ verbose=embedder.verbose,
249
+ )
250
+ # Evaluate the answered evals with Ragas.
251
+ evaluation_df = ragas_evaluate(
252
+ dataset=Dataset.from_pandas(answered_evals_df),
253
+ llm=lc_llm,
254
+ embeddings=lc_embedder,
255
+ run_config=RunConfig(max_workers=1),
256
+ ).to_pandas()
257
+ return evaluation_df
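
A hedged end-to-end sketch of the eval workflow above (assumes documents were already inserted with `insert_document()`; not part of the commit):

```python
from raglite._eval import answer_evals, evaluate, insert_evals

insert_evals(num_evals=10)                # Generate and store (question, contexts, answer) evals.
answered_df = answer_evals(num_evals=10)  # Answer the stored evals with RAG.
scores_df = evaluate(answered_df)         # Score the answers with Ragas (needs the `ragas` extra).
```
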
src/raglite/_extract.py ADDED
@@ -0,0 +1,69 @@
1
+ """Extract structured data from unstructured text with an LLM."""
2
+
3
+ from typing import Any, TypeVar
4
+
5
+ from litellm import completion
6
+ from pydantic import BaseModel, ValidationError
7
+
8
+ from raglite._config import RAGLiteConfig
9
+
10
+ T = TypeVar("T", bound=BaseModel)
11
+
12
+
13
+ def extract_with_llm(
14
+ return_type: type[T],
15
+ user_prompt: str | list[str],
16
+ config: RAGLiteConfig | None = None,
17
+ **kwargs: Any,
18
+ ) -> T:
19
+ """Extract structured data from unstructured text with an LLM.
20
+
21
+ This function expects a `return_type.system_prompt: ClassVar[str]` that contains the system
22
+ prompt to use. Example:
23
+
24
+ from typing import ClassVar
25
+ from pydantic import BaseModel, Field
26
+
27
+ class MyNameResponse(BaseModel):
28
+ my_name: str = Field(..., description="The user's name.")
29
+ system_prompt: ClassVar[str] = "The system prompt to use (excluded from JSON schema)."
30
+
31
+ my_name_response = extract_with_llm(MyNameResponse, "My name is Thomas A. Anderson.")
32
+ """
33
+ # Load the default config if not provided.
34
+ config = config or RAGLiteConfig()
35
+ # Update the system prompt with the JSON schema of the return type to help the LLM.
36
+ system_prompt = "\n".join((
38
+ return_type.system_prompt.strip(), # type: ignore[attr-defined]
39
+ "Format your response according to this JSON schema:",
40
+ str(return_type.model_json_schema()),
41
+ ))
41
+ # Concatenate the user prompt if it is a list of strings.
42
+ if isinstance(user_prompt, list):
43
+ user_prompt = "\n\n".join(
44
+ f'<context index="{i}">\n{chunk.strip()}\n</context>'
45
+ for i, chunk in enumerate(user_prompt)
46
+ )
47
+ # Extract structured data from the unstructured input.
48
+ for _ in range(config.llm_max_tries):
49
+ response = completion(
50
+ model=config.llm,
51
+ messages=[
52
+ {"role": "system", "content": system_prompt},
53
+ {"role": "user", "content": user_prompt},
54
+ ],
55
+ response_format={"type": "json_object", "schema": return_type.model_json_schema()},
56
+ **kwargs,
57
+ )
58
+ try:
59
+ instance = return_type.model_validate_json(response["choices"][0]["message"]["content"])
60
+ except (KeyError, ValueError, ValidationError) as e:
61
+ # Malformed response, not a JSON string, or not a valid instance of the return type.
62
+ last_exception = e
63
+ continue
64
+ else:
65
+ break
66
+ else:
67
+ error_message = f"Failed to extract {return_type} from input {user_prompt}."
68
+ raise ValueError(error_message) from last_exception
69
+ return instance
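
As a complement to the docstring's example, a sketch of extraction from multiple contexts (the model class and field names below are illustrative only; each context string is wrapped in a `<context>` tag by the function):

```python
from typing import ClassVar

from pydantic import BaseModel, Field

from raglite._extract import extract_with_llm


class CityResponse(BaseModel):
    city: str = Field(..., description="The city mentioned in the contexts.")
    system_prompt: ClassVar[str] = "Extract the city mentioned in the provided contexts."


contexts = ["Thomas lives in Sydney.", "He moved there in 2012."]
response = extract_with_llm(CityResponse, contexts)
print(response.city)
```
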
src/raglite/_flashrank.py ADDED
@@ -0,0 +1,41 @@
1
+ """Patched version of FlashRankRanker that fixes incorrect reranking [1].
2
+
3
+ [1] https://github.com/AnswerDotAI/rerankers/issues/39
4
+ """
5
+
6
+ import contextlib
7
+ from io import StringIO
8
+ from typing import Any
9
+
10
+ from flashrank import RerankRequest
11
+
12
+ # Suppress rerankers output on import until [1] is fixed.
13
+ # [1] https://github.com/AnswerDotAI/rerankers/issues/36
14
+ with contextlib.redirect_stdout(StringIO()):
15
+ from rerankers.documents import Document
16
+ from rerankers.models.flashrank_ranker import FlashRankRanker
17
+ from rerankers.results import RankedResults, Result
18
+ from rerankers.utils import prep_docs
19
+
20
+
21
+ class PatchedFlashRankRanker(FlashRankRanker):
22
+ def rank(
23
+ self,
24
+ query: str,
25
+ docs: str | list[str] | Document | list[Document],
26
+ doc_ids: list[str] | list[int] | None = None,
27
+ metadata: list[dict[str, Any]] | None = None,
28
+ ) -> RankedResults:
29
+ docs = prep_docs(docs, doc_ids, metadata)
30
+ passages = [{"id": doc_idx, "text": doc.text} for doc_idx, doc in enumerate(docs)]
31
+ rerank_request = RerankRequest(query=query, passages=passages)
32
+ flashrank_results = self.model.rerank(rerank_request)
33
+ ranked_results = [
34
+ Result(
35
+ document=docs[result["id"]], # This patches the incorrect ranking in the original.
36
+ score=result["score"],
37
+ rank=idx + 1,
38
+ )
39
+ for idx, result in enumerate(flashrank_results)
40
+ ]
41
+ return RankedResults(results=ranked_results, query=query, has_scores=True)
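
A sketch of how the patched ranker could be used (the constructor arguments are assumptions about the `rerankers`/`flashrank` API and are not taken from this commit):

```python
# Hypothetical usage of the patched FlashRank reranker.
docs = ["RAGLite supports SQLite and PostgreSQL.", "Bananas are yellow."]
ranker = PatchedFlashRankRanker("ms-marco-MiniLM-L-12-v2", verbose=0)  # Model name is an assumption.
ranked = ranker.rank(query="Which databases does RAGLite support?", docs=docs)
print(ranked.results[0].document.text)  # The most relevant document comes first.
```
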
src/raglite/_insert.py ADDED
@@ -0,0 +1,160 @@
1
+ """Index documents."""
2
+
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ from sqlalchemy.engine import make_url
7
+ from sqlmodel import Session, select
8
+ from tqdm.auto import tqdm
9
+
10
+ from raglite._config import RAGLiteConfig
11
+ from raglite._database import Chunk, ChunkEmbedding, Document, IndexMetadata, create_database_engine
12
+ from raglite._embed import embed_sentences, sentence_embedding_type
13
+ from raglite._markdown import document_to_markdown
14
+ from raglite._split_chunks import split_chunks
15
+ from raglite._split_sentences import split_sentences
16
+ from raglite._typing import FloatMatrix
17
+
18
+
19
+ def _create_chunk_records(
20
+ document_id: str,
21
+ chunks: list[str],
22
+ chunk_embeddings: list[FloatMatrix],
23
+ config: RAGLiteConfig,
24
+ ) -> tuple[list[Chunk], list[list[ChunkEmbedding]]]:
25
+ """Process chunks into chunk and chunk embedding records."""
26
+ # Create the chunk records.
27
+ chunk_records, headings = [], ""
28
+ for i, chunk in enumerate(chunks):
29
+ # Create and append the chunk record.
30
+ record = Chunk.from_body(document_id=document_id, index=i, body=chunk, headings=headings)
31
+ chunk_records.append(record)
32
+ # Update the Markdown headings with those of this chunk.
33
+ headings = record.extract_headings()
34
+ # Create the chunk embedding records.
35
+ chunk_embedding_records = []
36
+ if sentence_embedding_type(config=config) == "late_chunking":
37
+ # Every chunk record is associated with a list of chunk embedding records, one for each of
38
+ # the sentences in the chunk.
39
+ for chunk_record, chunk_embedding in zip(chunk_records, chunk_embeddings, strict=True):
40
+ chunk_embedding_records.append(
41
+ [
42
+ ChunkEmbedding(chunk_id=chunk_record.id, embedding=sentence_embedding)
43
+ for sentence_embedding in chunk_embedding
44
+ ]
45
+ )
46
+ else:
47
+ # Embed the full chunks, including the current Markdown headings.
48
+ full_chunk_embeddings = embed_sentences([str(chunk) for chunk in chunks], config=config)
49
+ # Every chunk record is associated with a list of chunk embedding records. The chunk
50
+ # embedding records each correspond to a linear combination of a sentence embedding and an
51
+ # embedding of the full chunk with Markdown headings.
52
+ α = 0.382 # Golden ratio. # noqa: PLC2401
53
+ for chunk_record, chunk_embedding, full_chunk_embedding in zip(
54
+ chunk_records, chunk_embeddings, full_chunk_embeddings, strict=True
55
+ ):
56
+ chunk_embedding_records.append(
57
+ [
58
+ ChunkEmbedding(
59
+ chunk_id=chunk_record.id,
60
+ embedding=α * sentence_embedding + (1 - α) * full_chunk_embedding,
61
+ )
62
+ for sentence_embedding in chunk_embedding
63
+ ]
64
+ )
65
+ return chunk_records, chunk_embedding_records
66
+
67
+
68
+ def insert_document(doc_path: Path, *, config: RAGLiteConfig | None = None) -> None: # noqa: PLR0915
69
+ """Insert a document into the database and update the index."""
70
+ # Use the default config if not provided.
71
+ config = config or RAGLiteConfig()
72
+ db_backend = make_url(config.db_url).get_backend_name()
73
+ # Preprocess the document into chunks and chunk embeddings.
74
+ with tqdm(total=5, unit="step", dynamic_ncols=True) as pbar:
75
+ pbar.set_description("Initializing database")
76
+ engine = create_database_engine(config)
77
+ pbar.update(1)
78
+ pbar.set_description("Converting to Markdown")
79
+ doc = document_to_markdown(doc_path)
80
+ pbar.update(1)
81
+ pbar.set_description("Splitting sentences")
82
+ sentences = split_sentences(doc, max_len=config.chunk_max_size)
83
+ pbar.update(1)
84
+ pbar.set_description("Embedding sentences")
85
+ sentence_embeddings = embed_sentences(sentences, config=config)
86
+ pbar.update(1)
87
+ pbar.set_description("Splitting chunks")
88
+ chunks, chunk_embeddings = split_chunks(
89
+ sentences=sentences,
90
+ sentence_embeddings=sentence_embeddings,
91
+ sentence_window_size=config.embedder_sentence_window_size,
92
+ max_size=config.chunk_max_size,
93
+ )
94
+ pbar.update(1)
95
+ # Create and store the chunk records.
96
+ with Session(engine) as session:
97
+ # Add the document to the document table.
98
+ document_record = Document.from_path(doc_path)
99
+ if session.get(Document, document_record.id) is None:
100
+ session.add(document_record)
101
+ session.commit()
102
+ # Create the chunk records to insert into the chunk table.
103
+ chunk_records, chunk_embedding_records = _create_chunk_records(
104
+ document_record.id, chunks, chunk_embeddings, config
105
+ )
106
+ # Store the chunk and chunk embedding records.
107
+ for chunk_record, chunk_embedding_record_list in tqdm(
108
+ zip(chunk_records, chunk_embedding_records, strict=True),
109
+ desc="Inserting chunks",
110
+ total=len(chunk_records),
111
+ unit="chunk",
112
+ dynamic_ncols=True,
113
+ ):
114
+ if session.get(Chunk, chunk_record.id) is not None:
115
+ continue
116
+ session.add(chunk_record)
117
+ session.add_all(chunk_embedding_record_list)
118
+ session.commit()
119
+ # Manually update the vector search chunk index for SQLite.
120
+ if db_backend == "sqlite":
121
+ from pynndescent import NNDescent
122
+
123
+ with Session(engine) as session:
124
+ # Get the vector search chunk index from the database, or create a new one.
125
+ index_metadata = session.get(IndexMetadata, "default") or IndexMetadata(id="default")
126
+ chunk_ids = index_metadata.metadata_.get("chunk_ids", [])
127
+ chunk_sizes = index_metadata.metadata_.get("chunk_sizes", [])
128
+ # Get the unindexed chunks.
129
+ unindexed_chunks = list(session.exec(select(Chunk).offset(len(chunk_ids))).all())
130
+ if not unindexed_chunks:
131
+ return
132
+ # Assemble the unindexed chunk embeddings into a NumPy array.
133
+ unindexed_chunk_embeddings = [chunk.embedding_matrix for chunk in unindexed_chunks]
134
+ X = np.vstack(unindexed_chunk_embeddings) # noqa: N806
135
+ # Index the unindexed chunks.
136
+ with tqdm(
137
+ total=len(unindexed_chunks),
138
+ desc="Indexing chunks",
139
+ unit="chunk",
140
+ dynamic_ncols=True,
141
+ ) as pbar:
142
+ # Fit or update the ANN index.
143
+ if len(chunk_ids) == 0:
144
+ nndescent = NNDescent(X, metric=config.vector_search_index_metric)
145
+ else:
146
+ nndescent = index_metadata.metadata_["index"]
147
+ nndescent.update(X)
148
+ # Prepare the ANN index so it can to handle query vectors not in the training set.
149
+ nndescent.prepare()
150
+ # Update the index metadata and mark it as dirty by recreating the dictionary.
151
+ index_metadata.metadata_ = {
152
+ **index_metadata.metadata_,
153
+ "index": nndescent,
154
+ "chunk_ids": chunk_ids + [c.id for c in unindexed_chunks],
155
+ "chunk_sizes": chunk_sizes + [len(em) for em in unindexed_chunk_embeddings],
156
+ }
157
+ # Store the updated vector search chunk index.
158
+ session.add(index_metadata)
159
+ session.commit()
160
+ pbar.update(len(unindexed_chunks))
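
A minimal sketch of inserting a document with the default configuration (the file path is illustrative):

```python
from pathlib import Path

from raglite._insert import insert_document

# Converts the document to Markdown, splits and embeds its sentences, chunks it,
# and (for SQLite) updates the PyNNDescent vector search index.
insert_document(Path("paper.pdf"))
```
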
src/raglite/_litellm.py ADDED
@@ -0,0 +1,261 @@
1
+ """Add support for llama-cpp-python models to LiteLLM."""
2
+
3
+ import asyncio
4
+ import logging
5
+ import warnings
6
+ from collections.abc import AsyncIterator, Callable, Iterator
7
+ from functools import cache
8
+ from typing import Any, ClassVar, cast
9
+
10
+ import httpx
11
+ import litellm
12
+ from litellm import ( # type: ignore[attr-defined]
13
+ CustomLLM,
14
+ GenericStreamingChunk,
15
+ ModelResponse,
16
+ convert_to_model_response_object,
17
+ )
18
+ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
19
+ from llama_cpp import ( # type: ignore[attr-defined]
20
+ ChatCompletionRequestMessage,
21
+ CreateChatCompletionResponse,
22
+ CreateChatCompletionStreamResponse,
23
+ Llama,
24
+ LlamaRAMCache,
25
+ )
26
+
27
+ # Reduce the logging level for LiteLLM and flashrank.
28
+ logging.getLogger("litellm").setLevel(logging.WARNING)
29
+ logging.getLogger("flashrank").setLevel(logging.WARNING)
30
+
31
+
32
+ class LlamaCppPythonLLM(CustomLLM):
33
+ """A llama-cpp-python provider for LiteLLM.
34
+
35
+ This provider enables using llama-cpp-python models with LiteLLM. The LiteLLM model
36
+ specification is "llama-cpp-python/<hugging_face_repo_id>/<filename>@<n_ctx>", where n_ctx is
37
+ an optional parameter that specifies the context size of the model. If n_ctx is not provided or
38
+ if it's set to 0, the model's default context size is used.
39
+
40
+ Example usage:
41
+
42
+ ```python
43
+ from litellm import completion
44
+
45
+ response = completion(
46
+ model="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@4092",
47
+ messages=[{"role": "user", "content": "Hello world!"}],
48
+ # stream=True
49
+ )
50
+ ```
51
+ """
52
+
53
+ # Create a lock to prevent concurrent access to llama-cpp-python models.
54
+ streaming_lock: ClassVar[asyncio.Lock] = asyncio.Lock()
55
+
56
+ # The set of supported OpenAI parameters is the intersection of [1] and [2]. Not included:
57
+ # max_completion_tokens, stream_options, n, user, logprobs, top_logprobs, extra_headers.
58
+ # [1] https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion
59
+ # [2] https://docs.litellm.ai/docs/completion/input
60
+ supported_openai_params: ClassVar[list[str]] = [
61
+ "functions", # Deprecated
62
+ "function_call", # Deprecated
63
+ "tools",
64
+ "tool_choice",
65
+ "temperature",
66
+ "top_p",
67
+ "top_k",
68
+ "min_p",
69
+ "typical_p",
70
+ "stop",
71
+ "seed",
72
+ "response_format",
73
+ "max_tokens",
74
+ "presence_penalty",
75
+ "frequency_penalty",
76
+ "repeat_penalty",
77
+ "tfs_z",
78
+ "mirostat_mode",
79
+ "mirostat_tau",
80
+ "mirostat_eta",
81
+ "logit_bias",
82
+ ]
83
+
84
+ @staticmethod
85
+ @cache
86
+ def llm(model: str, **kwargs: Any) -> Llama:
87
+ # Drop the llama-cpp-python prefix from the model.
88
+ repo_id_filename = model.replace("llama-cpp-python/", "")
89
+ # Convert the LiteLLM model string to repo_id, filename, and n_ctx.
90
+ repo_id, filename = repo_id_filename.rsplit("/", maxsplit=1)
91
+ n_ctx = 0
92
+ if len(filename_n_ctx := filename.rsplit("@", maxsplit=1)) == 2: # noqa: PLR2004
93
+ filename, n_ctx_str = filename_n_ctx
94
+ n_ctx = int(n_ctx_str)
95
+ # Load the LLM.
96
+ with warnings.catch_warnings(): # Filter huggingface_hub warning about HF_TOKEN.
97
+ warnings.filterwarnings("ignore", category=UserWarning)
98
+ llm = Llama.from_pretrained(
99
+ repo_id=repo_id,
100
+ filename=filename,
101
+ n_ctx=n_ctx,
102
+ n_gpu_layers=-1,
103
+ verbose=False,
104
+ **kwargs,
105
+ )
106
+ # Enable caching.
107
+ llm.set_cache(LlamaRAMCache())
108
+ # Register the model info with LiteLLM.
109
+ litellm.register_model( # type: ignore[attr-defined]
110
+ {
111
+ model: {
112
+ "max_tokens": llm.n_ctx(),
113
+ "max_input_tokens": llm.n_ctx(),
114
+ "max_output_tokens": None,
115
+ "input_cost_per_token": 0.0,
116
+ "output_cost_per_token": 0.0,
117
+ "output_vector_size": llm.n_embd() if kwargs.get("embedding") else None,
118
+ "litellm_provider": "llama-cpp-python",
119
+ "mode": "embedding" if kwargs.get("embedding") else "completion",
120
+ "supported_openai_params": LlamaCppPythonLLM.supported_openai_params,
121
+ "supports_function_calling": True,
122
+ "supports_parallel_function_calling": True,
123
+ "supports_vision": False,
124
+ }
125
+ }
126
+ )
127
+ return llm
128
+
129
+ def completion( # noqa: PLR0913
130
+ self,
131
+ model: str,
132
+ messages: list[ChatCompletionRequestMessage],
133
+ api_base: str,
134
+ custom_prompt_dict: dict[str, Any],
135
+ model_response: ModelResponse,
136
+ print_verbose: Callable, # type: ignore[type-arg]
137
+ encoding: str,
138
+ api_key: str,
139
+ logging_obj: Any,
140
+ optional_params: dict[str, Any],
141
+ acompletion: Callable | None = None, # type: ignore[type-arg]
142
+ litellm_params: dict[str, Any] | None = None,
143
+ logger_fn: Callable | None = None, # type: ignore[type-arg]
144
+ headers: dict[str, Any] | None = None,
145
+ timeout: float | httpx.Timeout | None = None,
146
+ client: HTTPHandler | None = None,
147
+ ) -> ModelResponse:
148
+ llm = self.llm(model)
149
+ llama_cpp_python_params = {
150
+ k: v for k, v in optional_params.items() if k in self.supported_openai_params
151
+ }
152
+ response = cast(
153
+ CreateChatCompletionResponse,
154
+ llm.create_chat_completion(messages=messages, **llama_cpp_python_params),
155
+ )
156
+ litellm_model_response: ModelResponse = convert_to_model_response_object(
157
+ response_object=response,
158
+ model_response_object=model_response,
159
+ response_type="completion",
160
+ stream=False,
161
+ )
162
+ return litellm_model_response
163
+
164
+ def streaming( # noqa: PLR0913
165
+ self,
166
+ model: str,
167
+ messages: list[ChatCompletionRequestMessage],
168
+ api_base: str,
169
+ custom_prompt_dict: dict[str, Any],
170
+ model_response: ModelResponse,
171
+ print_verbose: Callable, # type: ignore[type-arg]
172
+ encoding: str,
173
+ api_key: str,
174
+ logging_obj: Any,
175
+ optional_params: dict[str, Any],
176
+ acompletion: Callable | None = None, # type: ignore[type-arg]
177
+ litellm_params: dict[str, Any] | None = None,
178
+ logger_fn: Callable | None = None, # type: ignore[type-arg]
179
+ headers: dict[str, Any] | None = None,
180
+ timeout: float | httpx.Timeout | None = None,
181
+ client: HTTPHandler | None = None,
182
+ ) -> Iterator[GenericStreamingChunk]:
183
+ llm = self.llm(model)
184
+ llama_cpp_python_params = {
185
+ k: v for k, v in optional_params.items() if k in self.supported_openai_params
186
+ }
187
+ stream = cast(
188
+ Iterator[CreateChatCompletionStreamResponse],
189
+ llm.create_chat_completion(messages=messages, **llama_cpp_python_params, stream=True),
190
+ )
191
+ for chunk in stream:
192
+ choices = chunk.get("choices", [])
193
+ for choice in choices:
194
+ text = choice.get("delta", {}).get("content", None)
195
+ finish_reason = choice.get("finish_reason")
196
+ litellm_generic_streaming_chunk = GenericStreamingChunk(
197
+ text=text, # type: ignore[typeddict-item]
198
+ is_finished=bool(finish_reason),
199
+ finish_reason=finish_reason, # type: ignore[typeddict-item]
200
+ usage=None,
201
+ index=choice.get("index"), # type: ignore[typeddict-item]
202
+ provider_specific_fields={
203
+ "id": chunk.get("id"),
204
+ "model": chunk.get("model"),
205
+ "created": chunk.get("created"),
206
+ "object": chunk.get("object"),
207
+ },
208
+ )
209
+ yield litellm_generic_streaming_chunk
210
+
211
+ async def astreaming( # type: ignore[misc,override] # noqa: PLR0913
212
+ self,
213
+ model: str,
214
+ messages: list[ChatCompletionRequestMessage],
215
+ api_base: str,
216
+ custom_prompt_dict: dict[str, Any],
217
+ model_response: ModelResponse,
218
+ print_verbose: Callable, # type: ignore[type-arg]
219
+ encoding: str,
220
+ api_key: str,
221
+ logging_obj: Any,
222
+ optional_params: dict[str, Any],
223
+ acompletion: Callable | None = None, # type: ignore[type-arg]
224
+ litellm_params: dict[str, Any] | None = None,
225
+ logger_fn: Callable | None = None, # type: ignore[type-arg]
226
+ headers: dict[str, Any] | None = None,
227
+ timeout: float | httpx.Timeout | None = None, # noqa: ASYNC109
228
+ client: AsyncHTTPHandler | None = None,
229
+ ) -> AsyncIterator[GenericStreamingChunk]:
230
+ # Start a synchronous stream.
231
+ stream = self.streaming(
232
+ model,
233
+ messages,
234
+ api_base,
235
+ custom_prompt_dict,
236
+ model_response,
237
+ print_verbose,
238
+ encoding,
239
+ api_key,
240
+ logging_obj,
241
+ optional_params,
242
+ acompletion,
243
+ litellm_params,
244
+ logger_fn,
245
+ headers,
246
+ timeout,
247
+ )
248
+ await asyncio.sleep(0) # Yield control to the event loop after initialising the context.
249
+ # Wrap the synchronous stream in an asynchronous stream.
250
+ async with LlamaCppPythonLLM.streaming_lock:
251
+ for litellm_generic_streaming_chunk in stream:
252
+ yield litellm_generic_streaming_chunk
253
+ await asyncio.sleep(0) # Yield control to the event loop after each token.
254
+
255
+
256
+ # Register the LlamaCppPythonLLM provider.
257
+ if not any(provider["provider"] == "llama-cpp-python" for provider in litellm.custom_provider_map):
258
+ litellm.custom_provider_map.append(
259
+ {"provider": "llama-cpp-python", "custom_handler": LlamaCppPythonLLM()}
260
+ )
261
+ litellm.suppress_debug_info = True
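
Building on the docstring's example, a streaming sketch now that the provider is registered (the model string and `@4096` context size are assumptions; the token extraction mirrors the pattern used in `_rag.py`):

```python
from litellm import completion

stream = completion(
    model="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@4096",
    messages=[{"role": "user", "content": "Hello world!"}],
    stream=True,
)
for chunk in stream:
    print(chunk["choices"][0]["delta"].get("content") or "", end="", flush=True)
```
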
src/raglite/_markdown.py ADDED
@@ -0,0 +1,221 @@
1
+ """Convert any document to Markdown."""
2
+
3
+ import re
4
+ from copy import deepcopy
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import mdformat
9
+ import numpy as np
10
+ from pdftext.extraction import dictionary_output
11
+ from sklearn.cluster import KMeans
12
+
13
+
14
+ def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915
15
+ """Convert a PDF parsed with pdftext to Markdown."""
16
+
17
+ def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]: # noqa: C901
18
+ """Add heading level metadata to a PDF parsed with pdftext."""
19
+
20
+ def extract_font_size(span: dict[str, Any]) -> float:
21
+ """Extract the font size from a text span."""
22
+ font_size: float = 1.0
23
+ if span["font"]["size"] > 1: # A value of 1 appears to mean "unknown" in pdftext.
24
+ font_size = span["font"]["size"]
25
+ elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
26
+ font_size = float(digit_sequences[-1])
27
+ elif "\n" not in span["text"]: # Occasionally a span can contain a newline character.
28
+ if round(span["rotation"]) in (0.0, 180.0, -180.0):
29
+ font_size = span["bbox"][3] - span["bbox"][1]
30
+ elif round(span["rotation"]) in (90.0, -90.0, 270.0, -270.0):
31
+ font_size = span["bbox"][2] - span["bbox"][0]
32
+ return font_size
33
+
34
+ # Copy the pages.
35
+ pages = deepcopy(pages)
36
+ # Extract an array of all font sizes used by the text spans.
37
+ font_sizes = np.asarray(
38
+ [
39
+ extract_font_size(span)
40
+ for page in pages
41
+ for block in page["blocks"]
42
+ for line in block["lines"]
43
+ for span in line["spans"]
44
+ ]
45
+ )
46
+ font_sizes = np.round(font_sizes * 2) / 2
47
+ unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
48
+ # Determine the paragraph font size as the mode font size.
49
+ tiny = unique_font_sizes < min(5, np.max(unique_font_sizes))
50
+ counts[tiny] = -counts[tiny]
51
+ mode = np.argmax(counts)
52
+ counts[tiny] = -counts[tiny]
53
+ mode_font_size = unique_font_sizes[mode]
54
+ # Determine (at most) 6 heading font sizes by clustering font sizes larger than the mode.
55
+ heading_font_sizes = unique_font_sizes[mode + 1 :]
56
+ if len(heading_font_sizes) > 0:
57
+ heading_counts = counts[mode + 1 :]
58
+ kmeans = KMeans(n_clusters=min(6, len(heading_font_sizes)), random_state=42)
59
+ kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
60
+ heading_font_sizes = np.sort(np.ravel(kmeans.cluster_centers_))[::-1]
61
+ # Add heading level information to the text spans and lines.
62
+ for page in pages:
63
+ for block in page["blocks"]:
64
+ for line in block["lines"]:
65
+ if "md" not in line:
66
+ line["md"] = {}
67
+ heading_level = np.zeros(8) # 0-5: <h1>-<h6>, 6: <p>, 7: <small>
68
+ for span in line["spans"]:
69
+ if "md" not in span:
70
+ span["md"] = {}
71
+ span_font_size = extract_font_size(span)
72
+ if span_font_size < mode_font_size:
73
+ idx = 7
74
+ elif span_font_size == mode_font_size:
75
+ idx = 6
76
+ else:
77
+ idx = np.argmin(np.abs(heading_font_sizes - span_font_size)) # type: ignore[assignment]
78
+ span["md"]["heading_level"] = idx + 1
79
+ heading_level[idx] += len(span["text"])
80
+ line["md"]["heading_level"] = np.argmax(heading_level) + 1
81
+ return pages
82
+
83
+ def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
84
+ """Add emphasis metadata such as bold and italic to a PDF parsed with pdftext."""
85
+ # Copy the pages.
86
+ pages = deepcopy(pages)
87
+ # Add emphasis metadata to the text spans.
88
+ for page in pages:
89
+ for block in page["blocks"]:
90
+ for line in block["lines"]:
91
+ if "md" not in line:
92
+ line["md"] = {}
93
+ for span in line["spans"]:
94
+ if "md" not in span:
95
+ span["md"] = {}
96
+ span["md"]["bold"] = span["font"]["weight"] > 500 # noqa: PLR2004
97
+ span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
98
+ line["md"]["bold"] = all(
99
+ span["md"]["bold"] for span in line["spans"] if span["text"].strip()
100
+ )
101
+ line["md"]["italic"] = all(
102
+ span["md"]["italic"] for span in line["spans"] if span["text"].strip()
103
+ )
104
+ return pages
105
+
106
+ def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
107
+ """Strip page numbers from a PDF parsed with pdftext."""
108
+ # Copy the pages.
109
+ pages = deepcopy(pages)
110
+ # Remove lines that only contain a page number.
111
+ for page in pages:
112
+ for block in page["blocks"]:
113
+ block["lines"] = [
114
+ line
115
+ for line in block["lines"]
116
+ if not re.match(
117
+ r"^\s*[#0]*\d+\s*$", "".join(span["text"] for span in line["spans"])
118
+ )
119
+ ]
120
+ return pages
121
+
122
+ def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0912
123
+ """Convert a list of pages to Markdown."""
124
+ pages_md = []
125
+ for page in pages:
126
+ page_md = ""
127
+ for block in page["blocks"]:
128
+ block_text = ""
129
+ for line in block["lines"]:
130
+ # Build the line text and style the spans.
131
+ line_text = ""
132
+ for span in line["spans"]:
133
+ if (
134
+ not line["md"]["bold"]
135
+ and not line["md"]["italic"]
136
+ and span["md"]["bold"]
137
+ and span["md"]["italic"]
138
+ ):
139
+ line_text += f"***{span['text']}***"
140
+ elif not line["md"]["bold"] and span["md"]["bold"]:
141
+ line_text += f"**{span['text']}**"
142
+ elif not line["md"]["italic"] and span["md"]["italic"]:
143
+ line_text += f"*{span['text']}*"
144
+ else:
145
+ line_text += span["text"]
146
+ # Add emphasis to the line (if it's not a heading or whitespace).
147
+ line_text = line_text.rstrip()
148
+ line_is_whitespace = not line_text.strip()
149
+ line_is_heading = line["md"]["heading_level"] <= 6 # noqa: PLR2004
150
+ if not line_is_heading and not line_is_whitespace:
151
+ if line["md"]["bold"] and line["md"]["italic"]:
152
+ line_text = f"***{line_text}***"
153
+ elif line["md"]["bold"]:
154
+ line_text = f"**{line_text}**"
155
+ elif line["md"]["italic"]:
156
+ line_text = f"*{line_text}*"
157
+ # Set the heading level.
158
+ if line_is_heading and not line_is_whitespace:
159
+ line_text = f"{'#' * line['md']['heading_level']} {line_text}"
160
+ line_text += "\n"
161
+ block_text += line_text
162
+ block_text = block_text.rstrip() + "\n\n"
163
+ page_md += block_text
164
+ pages_md.append(page_md.strip())
165
+ return pages_md
166
+
167
+ def merge_split_headings(pages: list[str]) -> list[str]:
168
+ """Merge headings that are split across lines."""
169
+
170
+ def _merge_split_headings(match: re.Match[str]) -> str:
171
+ atx_headings = [line.strip("# ").strip() for line in match.group().splitlines()]
172
+ return f"{match.group(1)} {' '.join(atx_headings)}\n\n"
173
+
174
+ pages_md = [
175
+ re.sub(
176
+ r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
177
+ _merge_split_headings,
178
+ page,
179
+ flags=re.MULTILINE,
180
+ )
181
+ for page in pages
182
+ ]
183
+ return pages_md
184
+
185
+ # Add heading level metadata.
186
+ pages = add_heading_level_metadata(pages)
187
+ # Add emphasis metadata.
188
+ pages = add_emphasis_metadata(pages)
189
+ # Strip page numbers.
190
+ pages = strip_page_numbers(pages)
191
+ # Convert the pages to Markdown.
192
+ pages_md = convert_to_markdown(pages)
193
+ # Merge headings that are split across lines.
194
+ pages_md = merge_split_headings(pages_md)
195
+ return pages_md
196
+
197
+
198
+ def document_to_markdown(doc_path: Path) -> str:
199
+ """Convert any document to GitHub Flavored Markdown."""
200
+ # Convert the file's content to GitHub Flavored Markdown.
201
+ if doc_path.suffix == ".pdf":
202
+ # Parse the PDF with pdftext and convert it to Markdown.
203
+ pages = dictionary_output(doc_path, sort=True, keep_chars=False)
204
+ doc = "\n\n".join(parsed_pdf_to_markdown(pages))
205
+ else:
206
+ try:
207
+ # Use pandoc for everything else.
208
+ import pypandoc
209
+
210
+ doc = pypandoc.convert_file(doc_path, to="gfm")
211
+ except ImportError as error:
212
+ error_message = (
213
+ "To convert files to Markdown with pandoc, please install the `pandoc` extra."
214
+ )
215
+ raise ImportError(error_message) from error
216
+ except RuntimeError:
217
+ # File format not supported, fall back to reading the text.
218
+ doc = doc_path.read_text()
219
+ # Improve Markdown quality.
220
+ doc = mdformat.text(doc)
221
+ return doc
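
A short sketch of the conversion entry point (the file name is illustrative; pandoc is only needed for non-PDF formats):

```python
from pathlib import Path

from raglite._markdown import document_to_markdown

markdown = document_to_markdown(Path("report.pdf"))
print(markdown[:200])
```
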
src/raglite/_query_adapter.py ADDED
@@ -0,0 +1,162 @@
1
+ """Compute and update an optimal query adapter."""
2
+
3
+ import numpy as np
4
+ from sqlmodel import Session, col, select
5
+ from tqdm.auto import tqdm
6
+
7
+ from raglite._config import RAGLiteConfig
8
+ from raglite._database import Chunk, ChunkEmbedding, Eval, IndexMetadata, create_database_engine
9
+ from raglite._embed import embed_sentences
10
+ from raglite._search import vector_search
11
+
12
+
13
+ def update_query_adapter( # noqa: PLR0915, C901
14
+ *,
15
+ max_triplets: int = 4096,
16
+ max_triplets_per_eval: int = 64,
17
+ optimize_top_k: int = 40,
18
+ config: RAGLiteConfig | None = None,
19
+ ) -> None:
20
+ """Compute an optimal query adapter and update the database with it.
21
+
22
+ This function computes an optimal linear transform A, called a 'query adapter', that is used to
23
+ transform a query embedding q as A @ q before searching for the nearest neighbouring chunks in
24
+ order to improve the quality of the search results.
25
+
26
+ Given a set of triplets (qᵢ, pᵢ, nᵢ), we want to find the query adapter A that increases the
27
+ score pᵢ'qᵢ of the positive chunk pᵢ and decreases the score nᵢ'qᵢ of the negative chunk nᵢ.
28
+
29
+ If the nearest neighbour search uses the dot product as its relevance score, we can find the
30
+ optimal query adapter by solving the following relaxed Procrustes optimisation problem with a
31
+ bound on the Frobenius norm of A:
32
+
33
+ A* = argmax Σᵢ pᵢ' (A qᵢ) - nᵢ' (A qᵢ)
34
+ Σᵢ (pᵢ - nᵢ)' A qᵢ
35
+ trace[ (P - N) A Q' ] where Q := [q₁'; ...; qₖ']
36
+ P := [p₁'; ...; pₖ']
37
+ N := [n₁'; ...; nₖ']
38
+ trace[ Q' (P - N) A ]
39
+ trace[ M A ] where M := Q' (P - N)
40
+ s.t. ||A||_F == 1
41
+ = M' / ||M||_F
42
+
43
+ If the nearest neighbour search uses the cosine similarity as its relevance score, we can find
44
+ the optimal query adapter by solving the following orthogonal Procrustes optimisation problem
45
+ with an orthogonality constraint on A:
46
+
47
+ A* = argmax Σᵢ pᵢ' (A qᵢ) - nᵢ' (A qᵢ)
48
+ Σᵢ (pᵢ - nᵢ)' A qᵢ
49
+ trace[ (P - N) A Q' ]
50
+ trace[ Q' (P - N) A ]
51
+ trace[ M A ]
52
+ trace[ U Σ V' A ] where U Σ V' := M is the SVD of M
53
+ trace[ Σ V' A U ]
54
+ s.t. A'A == 𝕀
55
+ = V U'
56
+
57
+ Additionally, we want to limit the effect of A* so that it adjusts q just enough to invert
58
+ incorrectly ordered (q, p, n) triplets, but not so much as to affect the correctly ordered ones.
59
+ To achieve this, we'll rewrite M as α(M / s) + (1 - α)𝕀, where s scales M to the same norm as 𝕀,
60
+ and choose the smallest α that ranks (q, p, n) correctly. If α = 0, the relevance score gap
61
+ between an incorrect (p, n) pair would be B := (p - n)' q < 0. If α = 1, the relevance score gap
62
+ would be A := (p - n)' (p - n) / ||p - n|| > 0. For a target relevance score gap of say
63
+ C := 5% * A, the optimal α is then given by αA + (1 - α)B = C => α = (B - C) / (B - A).
64
+ """
65
+ config = config or RAGLiteConfig()
66
+ config_no_query_adapter = RAGLiteConfig(
67
+ **{**config.__dict__, "vector_search_query_adapter": False}
68
+ )
69
+ engine = create_database_engine(config)
70
+ with Session(engine) as session:
71
+ # Get random evals from the database.
72
+ chunk_embedding = session.exec(select(ChunkEmbedding).limit(1)).first()
73
+ if chunk_embedding is None:
74
+ error_message = "First run `insert_document()` to insert documents."
75
+ raise ValueError(error_message)
76
+ evals = session.exec(
77
+ select(Eval).order_by(Eval.id).limit(max(8, max_triplets // max_triplets_per_eval))
78
+ ).all()
79
+ if len(evals) * max_triplets_per_eval < len(chunk_embedding.embedding):
80
+ error_message = "First run `insert_evals()` to generate sufficient evals."
81
+ raise ValueError(error_message)
82
+ # Loop over the evals to generate (q, p, n) triplets.
83
+ Q = np.zeros((0, len(chunk_embedding.embedding))) # noqa: N806
84
+ P = np.zeros_like(Q) # noqa: N806
85
+ N = np.zeros_like(Q) # noqa: N806
86
+ for eval_ in tqdm(
87
+ evals, desc="Extracting triplets from evals", unit="eval", dynamic_ncols=True
88
+ ):
89
+ # Embed the question.
90
+ question_embedding = embed_sentences([eval_.question], config=config)
91
+ # Retrieve chunks that would be used to answer the question.
92
+ chunk_ids, _ = vector_search(
93
+ question_embedding, num_results=optimize_top_k, config=config_no_query_adapter
94
+ )
95
+ retrieved_chunks = session.exec(select(Chunk).where(col(Chunk.id).in_(chunk_ids))).all()
96
+ # Extract (q, p, n) triplets by comparing the retrieved chunks with the eval.
97
+ num_triplets = 0
98
+ for i, retrieved_chunk in enumerate(retrieved_chunks):
99
+ # Select irrelevant chunks.
100
+ if retrieved_chunk.id not in eval_.chunk_ids:
101
+ # Look up all positive chunks (each represented by the mean of its multi-vector
102
+ # embedding) that are ranked lower than this negative one (represented by the
103
+ # embedding in the multi-vector embedding that best matches the query).
104
+ p_mean = [
105
+ np.mean(chunk.embedding_matrix, axis=0, keepdims=True)
106
+ for chunk in retrieved_chunks[i + 1 :]
107
+ if chunk is not None and chunk.id in eval_.chunk_ids
108
+ ]
109
+ n_top = retrieved_chunk.embedding_matrix[
110
+ np.argmax(retrieved_chunk.embedding_matrix @ question_embedding.T),
111
+ np.newaxis,
112
+ :,
113
+ ]
114
+ # Filter out any (p, n, q) triplets for which the mean positive embedding ranks
115
+ # higher than the top negative one.
116
+ p_mean = [p_e for p_e in p_mean if (n_top - p_e) @ question_embedding.T > 0]
117
+ if not p_mean:
118
+ continue
119
+ # Stack the (p, n, q) triplets.
120
+ p = np.vstack(p_mean)
121
+ n = np.repeat(n_top, p.shape[0], axis=0)
122
+ q = np.repeat(question_embedding, p.shape[0], axis=0)
123
+ num_triplets += p.shape[0]
124
+ # Append the (query, positive, negative) tuples to the Q, P, N matrices.
125
+ Q = np.vstack([Q, q]) # noqa: N806
126
+ P = np.vstack([P, p]) # noqa: N806
127
+ N = np.vstack([N, n]) # noqa: N806
128
+ # Check if we have sufficient triplets for this eval.
129
+ if num_triplets >= max_triplets_per_eval:
130
+ break
131
+ # Check if we have sufficient triplets to compute the query adapter.
132
+ if Q.shape[0] > max_triplets:
133
+ Q, P, N = Q[:max_triplets, :], P[:max_triplets, :], N[:max_triplets, :] # noqa: N806
134
+ break
135
+ # Normalise the rows of Q, P, N.
136
+ Q /= np.linalg.norm(Q, axis=1, keepdims=True) # noqa: N806
137
+ P /= np.linalg.norm(P, axis=1, keepdims=True) # noqa: N806
138
+ N /= np.linalg.norm(N, axis=1, keepdims=True) # noqa: N806
139
+ # Compute the optimal weighted query adapter A*.
140
+ # TODO: Matmul in float16 is extremely slow compared to single or double precision, why?
141
+ gap_before = np.sum((P - N) * Q, axis=1)
142
+ gap_after = 2 * (1 - np.sum(P * N, axis=1)) / np.linalg.norm(P - N, axis=1)
143
+ gap_target = 0.05 * gap_after
144
+ α = (gap_before - gap_target) / (gap_before - gap_after) # noqa: PLC2401
145
+ MT = (α[:, np.newaxis] * (P - N)).T @ Q # noqa: N806
146
+ s = np.linalg.norm(MT, ord="fro") / np.sqrt(MT.shape[0])
147
+ MT = np.mean(α) * (MT / s) + np.mean(1 - α) * np.eye(Q.shape[1]) # noqa: N806
148
+ if config.vector_search_index_metric == "dot":
149
+ # Use the relaxed Procrustes solution.
150
+ A_star = MT / np.linalg.norm(MT, ord="fro") # noqa: N806
151
+ elif config.vector_search_index_metric == "cosine":
152
+ # Use the orthogonal Procrustes solution.
153
+ U, _, VT = np.linalg.svd(MT, full_matrices=False) # noqa: N806
154
+ A_star = U @ VT # noqa: N806
155
+ else:
156
+ error_message = f"Unsupported ANN metric: {config.vector_search_index_metric}"
157
+ raise ValueError(error_message)
158
+ # Store the optimal query adapter in the database.
159
+ index_metadata = session.get(IndexMetadata, "default") or IndexMetadata(id="default")
160
+ index_metadata.metadata_ = {**index_metadata.metadata_, "query_adapter": A_star}
161
+ session.add(index_metadata)
162
+ session.commit()
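
An illustrative NumPy sketch of the two closed-form solutions derived in the docstring, using toy unit-norm triplet matrices (not part of the commit):

```python
import numpy as np

rng = np.random.default_rng(42)
Q, P, N = (rng.standard_normal((8, 4)) for _ in range(3))
Q, P, N = (X / np.linalg.norm(X, axis=1, keepdims=True) for X in (Q, P, N))
MT = (P - N).T @ Q  # M' where M := Q' (P - N).
A_dot = MT / np.linalg.norm(MT, ord="fro")  # Relaxed Procrustes solution (dot product metric).
U, _, VT = np.linalg.svd(MT, full_matrices=False)
A_cos = U @ VT  # Orthogonal Procrustes solution (cosine metric), equal to V U' in the docstring.
# The orthogonal solution never scores worse than the identity on the training triplets:
assert np.trace((P - N) @ A_cos @ Q.T) >= np.trace((P - N) @ Q.T) - 1e-9
```
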
src/raglite/_rag.py ADDED
@@ -0,0 +1,166 @@
1
+ """Retrieval-augmented generation."""
2
+
3
+ from collections.abc import AsyncIterator, Iterator
4
+
5
+ from litellm import acompletion, completion, get_model_info # type: ignore[attr-defined]
6
+
7
+ from raglite._config import RAGLiteConfig
8
+ from raglite._database import Chunk
9
+ from raglite._litellm import LlamaCppPythonLLM
10
+ from raglite._search import hybrid_search, rerank_chunks, retrieve_segments
11
+ from raglite._typing import SearchMethod
12
+
13
+ RAG_SYSTEM_PROMPT = """
14
+ You are a friendly and knowledgeable assistant that provides complete and insightful answers.
15
+ Answer the user's question using only the context below.
16
+ When responding, you MUST NOT reference the existence of the context, directly or indirectly.
17
+ Instead, you MUST treat the context as if its contents are entirely part of your working memory.
18
+ """.strip()
19
+
20
+
21
+ def _max_contexts(
22
+ prompt: str,
23
+ *,
24
+ max_contexts: int = 5,
25
+ context_neighbors: tuple[int, ...] | None = (-1, 1),
26
+ messages: list[dict[str, str]] | None = None,
27
+ config: RAGLiteConfig | None = None,
28
+ ) -> int:
29
+ """Determine the maximum number of contexts for RAG."""
30
+ # If the user has configured a llama-cpp-python model, we ensure that LiteLLM's model info is up
31
+ # to date by loading that LLM.
32
+ config = config or RAGLiteConfig()
33
+ if config.llm.startswith("llama-cpp-python"):
34
+ _ = LlamaCppPythonLLM.llm(config.llm)
35
+ # Get the model's maximum context size.
36
+ llm_provider = "llama-cpp-python" if config.llm.startswith("llama-cpp") else None
37
+ model_info = get_model_info(config.llm, custom_llm_provider=llm_provider)
38
+ max_tokens = model_info.get("max_tokens") or 2048
39
+ # Reduce the maximum number of contexts to take into account the LLM's context size.
40
+ max_context_tokens = (
41
+ max_tokens
42
+ - sum(len(message["content"]) // 3 for message in messages or []) # Previous messages.
43
+ - len(RAG_SYSTEM_PROMPT) // 3 # System prompt.
44
+ - len(prompt) // 3 # User prompt.
45
+ )
46
+ max_tokens_per_context = config.chunk_max_size // 3
47
+ max_tokens_per_context *= 1 + len(context_neighbors or [])
48
+ max_contexts = min(max_contexts, max_context_tokens // max_tokens_per_context)
49
+ if max_contexts <= 0:
50
+ error_message = "Not enough context tokens available for RAG."
51
+ raise ValueError(error_message)
52
+ return max_contexts
53
+
54
+
55
+ def _contexts( # noqa: PLR0913
56
+ prompt: str,
57
+ *,
58
+ max_contexts: int = 5,
59
+ context_neighbors: tuple[int, ...] | None = (-1, 1),
60
+ search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
61
+ messages: list[dict[str, str]] | None = None,
62
+ config: RAGLiteConfig | None = None,
63
+ ) -> list[str]:
64
+ """Retrieve contexts for RAG."""
65
+ # Determine the maximum number of contexts.
66
+ max_contexts = _max_contexts(
67
+ prompt,
68
+ max_contexts=max_contexts,
69
+ context_neighbors=context_neighbors,
70
+ messages=messages,
71
+ config=config,
72
+ )
73
+ # Retrieve the top chunks.
74
+ config = config or RAGLiteConfig()
75
+ chunks: list[str] | list[Chunk]
76
+ if callable(search):
77
+ # If the user has configured a reranker, we retrieve extra contexts to rerank.
78
+ extra_contexts = 3 * max_contexts if config.reranker else 0
79
+ # Retrieve relevant contexts.
80
+ chunk_ids, _ = search(prompt, num_results=max_contexts + extra_contexts, config=config)
81
+ # Rerank the relevant contexts.
82
+ chunks = rerank_chunks(query=prompt, chunk_ids=chunk_ids, config=config)
83
+ else:
84
+ # The user has passed a list of chunk_ids or chunks directly.
85
+ chunks = search
86
+ # Extend the top contexts with their neighbors and group chunks into contiguous segments.
87
+ segments = retrieve_segments(chunks[:max_contexts], neighbors=context_neighbors, config=config)
88
+ return segments
89
+
90
+
91
+ def rag( # noqa: PLR0913
92
+ prompt: str,
93
+ *,
94
+ max_contexts: int = 5,
95
+ context_neighbors: tuple[int, ...] | None = (-1, 1),
96
+ search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
97
+ messages: list[dict[str, str]] | None = None,
98
+ system_prompt: str = RAG_SYSTEM_PROMPT,
99
+ config: RAGLiteConfig | None = None,
100
+ ) -> Iterator[str]:
101
+ """Retrieval-augmented generation."""
102
+ # Get the contexts for RAG as contiguous segments of chunks.
103
+ config = config or RAGLiteConfig()
104
+ segments = _contexts(
105
+ prompt,
106
+ max_contexts=max_contexts,
107
+ context_neighbors=context_neighbors,
108
+ search=search,
109
+ config=config,
110
+ )
111
+ system_prompt = f"{system_prompt}\n\n" + "\n\n".join(
112
+ f'<context index="{i}">\n{segment.strip()}\n</context>'
113
+ for i, segment in enumerate(segments)
114
+ )
115
+ # Stream the LLM response.
116
+ stream = completion(
117
+ model=config.llm,
118
+ messages=[
119
+ *(messages or []),
120
+ {"role": "system", "content": system_prompt},
121
+ {"role": "user", "content": prompt},
122
+ ],
123
+ stream=True,
124
+ )
125
+ for output in stream:
126
+ token: str = output["choices"][0]["delta"].get("content") or ""
127
+ yield token
128
+
129
+
130
+ async def async_rag( # noqa: PLR0913
131
+ prompt: str,
132
+ *,
133
+ max_contexts: int = 5,
134
+ context_neighbors: tuple[int, ...] | None = (-1, 1),
135
+ search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
136
+ messages: list[dict[str, str]] | None = None,
137
+ system_prompt: str = RAG_SYSTEM_PROMPT,
138
+ config: RAGLiteConfig | None = None,
139
+ ) -> AsyncIterator[str]:
140
+ """Retrieval-augmented generation."""
141
+ # Get the contexts for RAG as contiguous segments of chunks.
142
+ config = config or RAGLiteConfig()
143
+ segments = _contexts(
144
+ prompt,
145
+ max_contexts=max_contexts,
146
+ context_neighbors=context_neighbors,
147
+ search=search,
148
+ config=config,
149
+ )
150
+ system_prompt = f"{system_prompt}\n\n" + "\n\n".join(
151
+ f'<context index="{i}">\n{segment.strip()}\n</context>'
152
+ for i, segment in enumerate(segments)
153
+ )
154
+ # Stream the LLM response.
155
+ async_stream = await acompletion(
156
+ model=config.llm,
157
+ messages=[
158
+ *(messages or []),
159
+ {"role": "system", "content": system_prompt},
160
+ {"role": "user", "content": prompt},
161
+ ],
162
+ stream=True,
163
+ )
164
+ async for output in async_stream:
165
+ token: str = output["choices"][0]["delta"].get("content") or ""
166
+ yield token
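
A minimal usage sketch of the streaming rag() entrypoint defined above. The database URL and question are illustrative assumptions (the database must already contain inserted documents); rag and RAGLiteConfig are re-exported from the top-level raglite package, as tests/test_rag.py below confirms.

from raglite import RAGLiteConfig, rag

# Assumed: a pre-populated SQLite database; swap in your own db_url and embedder.
config = RAGLiteConfig(db_url="sqlite:///raglite.sqlite")
for token in rag("What does it mean for two events to be simultaneous?", config=config):
    print(token, end="", flush=True)
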
src/raglite/_search.py ADDED
@@ -0,0 +1,270 @@
1
+ """Query documents."""
2
+
3
+ import re
4
+ import string
5
+ from collections import defaultdict
6
+ from collections.abc import Sequence
7
+ from itertools import groupby
8
+ from typing import cast
9
+
10
+ import numpy as np
11
+ from langdetect import detect
12
+ from sqlalchemy.engine import make_url
13
+ from sqlmodel import Session, and_, col, or_, select, text
14
+
15
+ from raglite._config import RAGLiteConfig
16
+ from raglite._database import Chunk, ChunkEmbedding, IndexMetadata, create_database_engine
17
+ from raglite._embed import embed_sentences
18
+ from raglite._typing import FloatMatrix
19
+
20
+
21
+ def vector_search(
22
+ query: str | FloatMatrix,
23
+ *,
24
+ num_results: int = 3,
25
+ config: RAGLiteConfig | None = None,
26
+ ) -> tuple[list[str], list[float]]:
27
+ """Search chunks using ANN vector search."""
28
+ # Read the config.
29
+ config = config or RAGLiteConfig()
30
+ db_backend = make_url(config.db_url).get_backend_name()
31
+ # Get the index metadata (including the query adapter, and in the case of SQLite, the index).
32
+ index_metadata = IndexMetadata.get("default", config=config)
33
+ # Embed the query.
34
+ query_embedding = (
35
+ embed_sentences([query], config=config)[0, :] if isinstance(query, str) else np.ravel(query)
36
+ )
37
+ # Apply the query adapter to the query embedding.
38
+ Q = index_metadata.get("query_adapter") # noqa: N806
39
+ if config.vector_search_query_adapter and Q is not None:
40
+ query_embedding = (Q @ query_embedding).astype(query_embedding.dtype)
41
+ # Search for the multi-vector chunk embeddings that are most similar to the query embedding.
42
+ if db_backend == "postgresql":
43
+ # Check that the selected metric is supported by pgvector.
44
+ metrics = {"cosine": "<=>", "dot": "<#>", "euclidean": "<->", "l1": "<+>", "l2": "<->"}
45
+ if config.vector_search_index_metric not in metrics:
46
+ error_message = f"Unsupported metric {config.vector_search_index_metric}."
47
+ raise ValueError(error_message)
48
+ # With pgvector, we can obtain the nearest neighbours and similarities with a single query.
49
+ engine = create_database_engine(config)
50
+ with Session(engine) as session:
51
+ distance_func = getattr(
52
+ ChunkEmbedding.embedding, f"{config.vector_search_index_metric}_distance"
53
+ )
54
+ distance = distance_func(query_embedding).label("distance")
55
+ results = session.exec(
56
+ select(ChunkEmbedding.chunk_id, distance).order_by(distance).limit(8 * num_results)
57
+ )
58
+ chunk_ids_, distance = zip(*results, strict=True)
59
+ chunk_ids, similarity = np.asarray(chunk_ids_), 1.0 - np.asarray(distance)
60
+ elif db_backend == "sqlite":
61
+ # Load the NNDescent index.
62
+ index = index_metadata.get("index")
63
+ ids = np.asarray(index_metadata.get("chunk_ids"))
64
+ cumsum = np.cumsum(np.asarray(index_metadata.get("chunk_sizes")))
65
+ # Find the neighbouring multi-vector indices.
66
+ from pynndescent import NNDescent
67
+
68
+ multi_vector_indices, distance = cast(NNDescent, index).query(
69
+ query_embedding[np.newaxis, :], k=8 * num_results
70
+ )
71
+ similarity = 1 - distance[0, :]
72
+ # Transform the multi-vector indices into chunk indices, and then to chunk ids.
73
+ chunk_indices = np.searchsorted(cumsum, multi_vector_indices[0, :], side="right") + 1
74
+ chunk_ids = np.asarray([ids[chunk_index - 1] for chunk_index in chunk_indices])
75
+ # Score each unique chunk id as the mean similarity of its multi-vector hits. Chunk ids with
76
+ # fewer hits are padded with the minimum similarity of the result set.
77
+ unique_chunk_ids, counts = np.unique(chunk_ids, return_counts=True)
78
+ score = np.full(
79
+ (len(unique_chunk_ids), np.max(counts)), np.min(similarity), dtype=similarity.dtype
80
+ )
81
+ for i, (unique_chunk_id, count) in enumerate(zip(unique_chunk_ids, counts, strict=True)):
82
+ score[i, :count] = similarity[chunk_ids == unique_chunk_id]
83
+ pooled_similarity = np.mean(score, axis=1)
84
+ # Sort the chunk ids by their adjusted similarity.
85
+ sorted_indices = np.argsort(pooled_similarity)[::-1]
86
+ unique_chunk_ids = unique_chunk_ids[sorted_indices][:num_results]
87
+ pooled_similarity = pooled_similarity[sorted_indices][:num_results]
88
+ return unique_chunk_ids.tolist(), pooled_similarity.tolist()
89
+
90
+
91
+ def keyword_search(
92
+ query: str, *, num_results: int = 3, config: RAGLiteConfig | None = None
93
+ ) -> tuple[list[str], list[float]]:
94
+ """Search chunks using BM25 keyword search."""
95
+ # Read the config.
96
+ config = config or RAGLiteConfig()
97
+ db_backend = make_url(config.db_url).get_backend_name()
98
+ # Connect to the database.
99
+ engine = create_database_engine(config)
100
+ with Session(engine) as session:
101
+ if db_backend == "postgresql":
102
+ # Convert the query to a tsquery [1].
103
+ # [1] https://www.postgresql.org/docs/current/textsearch-controls.html
104
+ query_escaped = re.sub(r"[&|!():<>\"]", " ", query)
105
+ tsv_query = " | ".join(query_escaped.split())
106
+ # Perform keyword search with tsvector.
107
+ statement = text("""
108
+ SELECT id as chunk_id, ts_rank(to_tsvector('simple', body), to_tsquery('simple', :query)) AS score
109
+ FROM chunk
110
+ WHERE to_tsvector('simple', body) @@ to_tsquery('simple', :query)
111
+ ORDER BY score DESC
112
+ LIMIT :limit;
113
+ """)
114
+ results = session.execute(statement, params={"query": tsv_query, "limit": num_results})
115
+ elif db_backend == "sqlite":
116
+ # Convert the query to an FTS5 query [1].
117
+ # [1] https://www.sqlite.org/fts5.html#full_text_query_syntax
118
+ query_escaped = re.sub(f"[{re.escape(string.punctuation)}]", "", query)
119
+ fts5_query = " OR ".join(query_escaped.split())
120
+ # Perform keyword search with FTS5. In FTS5, BM25 scores are negative [1], so we
121
+ # negate them to make them positive.
122
+ # [1] https://www.sqlite.org/fts5.html#the_bm25_function
123
+ statement = text("""
124
+ SELECT chunk.id as chunk_id, -bm25(keyword_search_chunk_index) as score
125
+ FROM chunk JOIN keyword_search_chunk_index ON chunk.rowid = keyword_search_chunk_index.rowid
126
+ WHERE keyword_search_chunk_index MATCH :match
127
+ ORDER BY score DESC
128
+ LIMIT :limit;
129
+ """)
130
+ results = session.execute(statement, params={"match": fts5_query, "limit": num_results})
131
+ # Unpack the results.
132
+ chunk_ids, keyword_score = zip(*results, strict=True)
133
+ chunk_ids, keyword_score = list(chunk_ids), list(keyword_score) # type: ignore[assignment]
134
+ return chunk_ids, keyword_score # type: ignore[return-value]
135
+
136
+
137
+ def reciprocal_rank_fusion(
138
+ rankings: list[list[str]], *, k: int = 60
139
+ ) -> tuple[list[str], list[float]]:
140
+ """Reciprocal Rank Fusion."""
141
+ # Compute the RRF score.
142
+ chunk_ids = {chunk_id for ranking in rankings for chunk_id in ranking}
143
+ chunk_id_score: defaultdict[str, float] = defaultdict(float)
144
+ for ranking in rankings:
145
+ chunk_id_index = {chunk_id: i for i, chunk_id in enumerate(ranking)}
146
+ for chunk_id in chunk_ids:
147
+ chunk_id_score[chunk_id] += 1 / (k + chunk_id_index.get(chunk_id, len(chunk_id_index)))
148
+ # Rank RRF results according to descending RRF score.
149
+ rrf_chunk_ids, rrf_score = zip(
150
+ *sorted(chunk_id_score.items(), key=lambda x: x[1], reverse=True), strict=True
151
+ )
152
+ return list(rrf_chunk_ids), list(rrf_score)
153
+
154
+
155
+ def hybrid_search(
156
+ query: str, *, num_results: int = 3, num_rerank: int = 100, config: RAGLiteConfig | None = None
157
+ ) -> tuple[list[str], list[float]]:
158
+ """Search chunks by combining ANN vector search with BM25 keyword search."""
159
+ # Run both searches.
160
+ vs_chunk_ids, _ = vector_search(query, num_results=num_rerank, config=config)
161
+ ks_chunk_ids, _ = keyword_search(query, num_results=num_rerank, config=config)
162
+ # Combine the results with Reciprocal Rank Fusion (RRF).
163
+ chunk_ids, hybrid_score = reciprocal_rank_fusion([vs_chunk_ids, ks_chunk_ids])
164
+ chunk_ids, hybrid_score = chunk_ids[:num_results], hybrid_score[:num_results]
165
+ return chunk_ids, hybrid_score
166
+
167
+
168
+ def retrieve_chunks(
169
+ chunk_ids: list[str],
170
+ *,
171
+ config: RAGLiteConfig | None = None,
172
+ ) -> list[Chunk]:
173
+ """Retrieve chunks by their ids."""
174
+ config = config or RAGLiteConfig()
175
+ engine = create_database_engine(config)
176
+ with Session(engine) as session:
177
+ chunks = list(session.exec(select(Chunk).where(col(Chunk.id).in_(chunk_ids))).all())
178
+ chunks = sorted(chunks, key=lambda chunk: chunk_ids.index(chunk.id))
179
+ return chunks
180
+
181
+
182
+ def retrieve_segments(
183
+ chunk_ids: list[str] | list[Chunk],
184
+ *,
185
+ neighbors: tuple[int, ...] | None = (-1, 1),
186
+ config: RAGLiteConfig | None = None,
187
+ ) -> list[str]:
188
+ """Group chunks into contiguous segments and retrieve them."""
189
+ # Retrieve the chunks.
190
+ config = config or RAGLiteConfig()
191
+ chunks: list[Chunk] = (
192
+ retrieve_chunks(chunk_ids, config=config) # type: ignore[arg-type,assignment]
193
+ if all(isinstance(chunk_id, str) for chunk_id in chunk_ids)
194
+ else chunk_ids
195
+ )
196
+ # Extend the chunks with their neighbouring chunks.
197
+ if neighbors:
198
+ engine = create_database_engine(config)
199
+ with Session(engine) as session:
200
+ neighbor_conditions = [
201
+ and_(Chunk.document_id == chunk.document_id, Chunk.index == chunk.index + offset)
202
+ for chunk in chunks
203
+ for offset in neighbors
204
+ ]
205
+ chunks += list(session.exec(select(Chunk).where(or_(*neighbor_conditions))).all())
206
+ # Keep only the unique chunks.
207
+ chunks = list(set(chunks))
208
+ # Sort the chunks by document_id and index (needed for groupby).
209
+ chunks = sorted(chunks, key=lambda chunk: (chunk.document_id, chunk.index))
210
+ # Group the chunks into contiguous segments.
211
+ segments: list[list[Chunk]] = []
212
+ for _, group in groupby(chunks, key=lambda chunk: chunk.document_id):
213
+ segment: list[Chunk] = []
214
+ for chunk in group:
215
+ if not segment or chunk.index == segment[-1].index + 1:
216
+ segment.append(chunk)
217
+ else:
218
+ segments.append(segment)
219
+ segment = [chunk]
220
+ segments.append(segment)
221
+ # Rank segments according to the aggregate relevance of their chunks.
222
+ chunk_id_to_score = {chunk.id: 1 / (i + 1) for i, chunk in enumerate(chunks)}
223
+ segments.sort(
224
+ key=lambda segment: sum(chunk_id_to_score.get(chunk.id, 0.0) for chunk in segment),
225
+ reverse=True,
226
+ )
227
+ # Convert the segments into strings.
228
+ segments = [
229
+ segment[0].headings.strip() + "\n\n" + "".join(chunk.body for chunk in segment).strip() # type: ignore[misc]
230
+ for segment in segments
231
+ ]
232
+ return segments # type: ignore[return-value]
233
+
234
+
235
+ def rerank_chunks(
236
+ query: str,
237
+ chunk_ids: list[str] | list[Chunk],
238
+ *,
239
+ config: RAGLiteConfig | None = None,
240
+ ) -> list[Chunk]:
241
+ """Rerank chunks according to their relevance to a given query."""
242
+ # Retrieve the chunks.
243
+ config = config or RAGLiteConfig()
244
+ chunks: list[Chunk] = (
245
+ retrieve_chunks(chunk_ids, config=config) # type: ignore[arg-type,assignment]
246
+ if all(isinstance(chunk_id, str) for chunk_id in chunk_ids)
247
+ else chunk_ids
248
+ )
249
+ # Early exit if no reranker is configured.
250
+ if not config.reranker:
251
+ return chunks
252
+ # Select the reranker.
253
+ if isinstance(config.reranker, Sequence):
254
+ # Detect the languages of the chunks and queries.
255
+ langs = {detect(str(chunk)) for chunk in chunks}
256
+ langs.add(detect(query))
257
+ # If all chunks and the query are in the same language, use a language-specific reranker.
258
+ rerankers = dict(config.reranker)
259
+ if len(langs) == 1 and (lang := next(iter(langs))) in rerankers:
260
+ reranker = rerankers[lang]
261
+ else:
262
+ reranker = rerankers.get("other")
263
+ else:
264
+ # A specific reranker was configured.
265
+ reranker = config.reranker
266
+ # Rerank the chunks.
267
+ if reranker:
268
+ results = reranker.rank(query=query, docs=[str(chunk) for chunk in chunks])
269
+ chunks = [chunks[result.doc_id] for result in results.results]
270
+ return chunks
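
An end-to-end retrieval sketch tying together the functions above (hybrid search, optional reranking, and segment retrieval). The query, database URL, and result counts are assumptions; all four names are re-exported from raglite, as the tests below show.

from raglite import RAGLiteConfig, hybrid_search, rerank_chunks, retrieve_segments

config = RAGLiteConfig(db_url="sqlite:///raglite.sqlite")  # assumed, pre-populated database
query = "What does it mean for two events to be simultaneous?"
chunk_ids, scores = hybrid_search(query, num_results=10, config=config)  # RRF of vector + keyword search
chunks = rerank_chunks(query, chunk_ids, config=config)  # returns the chunks unchanged if no reranker is configured
segments = retrieve_segments(chunks[:3], neighbors=(-1, 1), config=config)  # extend with neighbouring chunks
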
src/raglite/_split_chunks.py ADDED
@@ -0,0 +1,102 @@
1
+ """Split a document into semantic chunks."""
2
+
3
+ import re
4
+
5
+ import numpy as np
6
+ from scipy.optimize import linprog
7
+ from scipy.sparse import coo_matrix
8
+
9
+ from raglite._typing import FloatMatrix
10
+
11
+
12
+ def split_chunks( # noqa: C901, PLR0915
13
+ sentences: list[str],
14
+ sentence_embeddings: FloatMatrix,
15
+ sentence_window_size: int = 3,
16
+ max_size: int = 1440,
17
+ ) -> tuple[list[str], list[FloatMatrix]]:
18
+ """Split sentences into optimal semantic chunks with corresponding sentence embeddings."""
19
+ # Validate the input.
20
+ sentence_length = np.asarray([len(sentence) for sentence in sentences])
21
+ if not np.all(sentence_length <= max_size):
22
+ error_message = "Sentence with length larger than chunk max_size detected."
23
+ raise ValueError(error_message)
24
+ if not np.all(np.linalg.norm(sentence_embeddings, axis=1) > 0.0):
25
+ error_message = "Sentence embeddings with zero norm detected."
26
+ raise ValueError(error_message)
27
+ # Exit early if there is only one chunk to return.
28
+ if len(sentences) <= 1 or sum(sentence_length) <= max_size:
29
+ return ["".join(sentences)] if sentences else sentences, [sentence_embeddings]
30
+ # Normalise the sentence embeddings to unit norm.
31
+ X = sentence_embeddings.astype(np.float32) # noqa: N806
32
+ X = X / np.linalg.norm(X, axis=1, keepdims=True) # noqa: N806
33
+ # Select nonoutlying sentences and remove the discourse vector.
34
+ q15, q85 = np.quantile(sentence_length, [0.15, 0.85])
35
+ nonoutlying_sentences = (q15 <= sentence_length) & (sentence_length <= q85)
36
+ discourse = np.mean(X[nonoutlying_sentences, :], axis=0)
37
+ discourse = discourse / np.linalg.norm(discourse)
38
+ if not np.any(np.linalg.norm(X - discourse[np.newaxis, :], axis=1) <= np.finfo(X.dtype).eps):
39
+ X = X - np.outer(X @ discourse, discourse) # noqa: N806
40
+ X = X / np.linalg.norm(X, axis=1, keepdims=True) # noqa: N806
41
+ # For each partition point in the list of sentences, compute the similarity of the windows
42
+ # before and after the partition point. Sentence embeddings are assumed to be of the sentence
43
+ # itself and at most the (sentence_window_size - 1) sentences that precede it.
44
+ sentence_window_size = min(len(sentences) - 1, sentence_window_size)
45
+ windows_before = X[:-sentence_window_size]
46
+ windows_after = X[sentence_window_size:]
47
+ partition_similarity = np.ones(len(sentences) - 1, dtype=X.dtype)
48
+ partition_similarity[: len(windows_before)] = np.sum(windows_before * windows_after, axis=1)
49
+ # Make partition similarity nonnegative before modification and optimisation.
50
+ partition_similarity = np.maximum(
51
+ (partition_similarity + 1) / 2, np.sqrt(np.finfo(X.dtype).eps)
52
+ )
53
+ # Modify the partition similarity to encourage splitting on Markdown headings.
54
+ prev_sentence_is_heading = True
55
+ for i, sentence in enumerate(sentences[:-1]):
56
+ is_heading = bool(re.match(r"^#+\s", sentence.replace("\n", "").strip()))
57
+ if is_heading:
58
+ # Encourage splitting before a heading.
59
+ if not prev_sentence_is_heading:
60
+ partition_similarity[i - 1] = partition_similarity[i - 1] / 4
61
+ # Don't split immediately after a heading.
62
+ partition_similarity[i] = 1.0
63
+ prev_sentence_is_heading = is_heading
64
+ # Solve an optimisation problem to find the best partition points.
65
+ sentence_length_cumsum = np.cumsum(sentence_length)
66
+ row_indices = []
67
+ col_indices = []
68
+ data = []
69
+ for i in range(len(sentences) - 1):
70
+ r = sentence_length_cumsum[i - 1] if i > 0 else 0
71
+ idx = np.searchsorted(sentence_length_cumsum - r, max_size)
72
+ assert idx > i
73
+ if idx == len(sentence_length_cumsum):
74
+ break
75
+ cols = list(range(i, idx))
76
+ col_indices.extend(cols)
77
+ row_indices.extend([i] * len(cols))
78
+ data.extend([1] * len(cols))
79
+ A = coo_matrix( # noqa: N806
80
+ (data, (row_indices, col_indices)),
81
+ shape=(max(row_indices) + 1, len(sentences) - 1),
82
+ dtype=np.float32,
83
+ )
84
+ b_ub = np.ones(A.shape[0], dtype=np.float32)
85
+ res = linprog(
86
+ partition_similarity,
87
+ A_ub=-A,
88
+ b_ub=-b_ub,
89
+ bounds=(0, 1),
90
+ integrality=[1] * A.shape[1],
91
+ )
92
+ if not res.success:
93
+ error_message = "Optimization of chunk partitions failed."
94
+ raise ValueError(error_message)
95
+ # Split the sentences and their window embeddings into optimal chunks.
96
+ partition_indices = (np.where(res.x)[0] + 1).tolist()
97
+ chunks = [
98
+ "".join(sentences[i:j])
99
+ for i, j in zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True)
100
+ ]
101
+ chunk_embeddings = np.split(sentence_embeddings, partition_indices)
102
+ return chunks, chunk_embeddings
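
A small sketch of how split_chunks consumes sentences and their embeddings. The all-ones placeholder embeddings mirror tests/test_split_chunks.py; a real caller would pass embeddings produced by embed_sentences.

import numpy as np

from raglite._split_chunks import split_chunks

sentences = ["Sentence one. ", "Sentence two. ", "Sentence three. "]
sentence_embeddings = np.ones((len(sentences), 768), dtype=np.float16)  # placeholder embeddings
chunks, chunk_embeddings = split_chunks(
    sentences, sentence_embeddings, sentence_window_size=3, max_size=1440
)
# With such short sentences this takes the early exit and returns a single chunk.
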
src/raglite/_split_sentences.py ADDED
@@ -0,0 +1,76 @@
1
+ """Sentence splitter."""
2
+
3
+ import re
4
+
5
+ import spacy
6
+ from markdown_it import MarkdownIt
7
+ from spacy.language import Language
8
+
9
+
10
+ @Language.component("_mark_additional_sentence_boundaries")
11
+ def _mark_additional_sentence_boundaries(doc: spacy.tokens.Doc) -> spacy.tokens.Doc:
12
+ """Mark additional sentence boundaries in Markdown documents."""
13
+
14
+ def get_markdown_heading_indexes(doc: str) -> list[tuple[int, int]]:
15
+ """Get the indexes of the headings in a Markdown document."""
16
+ md = MarkdownIt()
17
+ tokens = md.parse(doc)
18
+ headings = []
19
+ lines = doc.splitlines(keepends=True)
20
+ char_idx = [0]
21
+ for line in lines:
22
+ char_idx.append(char_idx[-1] + len(line))
23
+ for token in tokens:
24
+ if token.type == "heading_open":
25
+ start_line, end_line = token.map # type: ignore[misc]
26
+ heading_start = char_idx[start_line]
27
+ heading_end = char_idx[end_line]
28
+ headings.append((heading_start, heading_end))
29
+ return headings
30
+
31
+ headings = get_markdown_heading_indexes(doc.text)
32
+ for heading_start, heading_end in headings:
33
+ # Mark the start of a heading as a new sentence.
34
+ for token in doc:
35
+ if heading_start <= token.idx:
36
+ token.is_sent_start = True
37
+ break
38
+ # Mark the end of a heading as a new sentence.
39
+ for token in doc:
40
+ if heading_end <= token.idx:
41
+ token.is_sent_start = True
42
+ break
43
+ return doc
44
+
45
+
46
+ def split_sentences(doc: str, max_len: int | None = None) -> list[str]:
47
+ """Split a document into sentences."""
48
+ # Split sentences with spaCy.
49
+ try:
50
+ nlp = spacy.load("xx_sent_ud_sm")
51
+ except OSError as error:
52
+ error_message = "Please install `xx_sent_ud_sm` with `pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl`."
53
+ raise ImportError(error_message) from error
54
+ nlp.add_pipe("_mark_additional_sentence_boundaries", before="senter")
55
+ sentences = [sent.text_with_ws for sent in nlp(doc).sents if sent.text.strip()]
56
+ # Apply additional splits on paragraphs and sentences because spaCy's splitting is not perfect.
57
+ if max_len is not None:
58
+ for pattern in (r"(?<=\n\n)", r"(?<=\.\s)"):
59
+ sentences = [
60
+ part
61
+ for sent in sentences
62
+ for part in ([sent] if len(sent) <= max_len else re.split(pattern, sent))
63
+ ]
64
+ # Recursively split long sentences in the middle if they are still too long.
65
+ if max_len is not None:
66
+ while any(len(sentence) > max_len for sentence in sentences):
67
+ sentences = [
68
+ part
69
+ for sent in sentences
70
+ for part in (
71
+ [sent]
72
+ if len(sent) <= max_len
73
+ else [sent[: len(sent) // 2], sent[len(sent) // 2 :]]
74
+ )
75
+ ]
76
+ return sentences
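
A sketch of the sentence-splitting step in context, following tests/test_embed.py. The document path is an assumption, and document_to_markdown / embed_sentences come from raglite._markdown and raglite._embed, which are not part of this section.

from raglite import RAGLiteConfig
from raglite._embed import embed_sentences
from raglite._markdown import document_to_markdown
from raglite._split_sentences import split_sentences

config = RAGLiteConfig()  # default embedder; adjust db_url and embedder as needed
doc = document_to_markdown("paper.pdf")  # assumed input document
sentences = split_sentences(doc, max_len=config.chunk_max_size)
sentence_embeddings = embed_sentences(sentences, config=config)
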
src/raglite/_typing.py ADDED
@@ -0,0 +1,145 @@
1
+ """RAGLite typing."""
2
+
3
+ import io
4
+ import pickle
5
+ from collections.abc import Callable
6
+ from typing import Any, Protocol
7
+
8
+ import numpy as np
9
+ from sqlalchemy.engine import Dialect
10
+ from sqlalchemy.sql.operators import Operators
11
+ from sqlalchemy.types import Float, LargeBinary, TypeDecorator, TypeEngine, UserDefinedType
12
+
13
+ from raglite._config import RAGLiteConfig
14
+
15
+ FloatMatrix = np.ndarray[tuple[int, int], np.dtype[np.floating[Any]]]
16
+ FloatVector = np.ndarray[tuple[int], np.dtype[np.floating[Any]]]
17
+ IntVector = np.ndarray[tuple[int], np.dtype[np.intp]]
18
+
19
+
20
+ class SearchMethod(Protocol):
21
+ def __call__(
22
+ self, query: str, *, num_results: int = 3, config: RAGLiteConfig | None = None
23
+ ) -> tuple[list[str], list[float]]: ...
24
+
25
+
26
+ class NumpyArray(TypeDecorator[np.ndarray[Any, np.dtype[np.floating[Any]]]]):
27
+ """A NumPy array column type for SQLAlchemy."""
28
+
29
+ impl = LargeBinary
30
+
31
+ def process_bind_param(
32
+ self, value: np.ndarray[Any, np.dtype[np.floating[Any]]] | None, dialect: Dialect
33
+ ) -> bytes | None:
34
+ """Convert a NumPy array to bytes."""
35
+ if value is None:
36
+ return None
37
+ buffer = io.BytesIO()
38
+ np.save(buffer, value, allow_pickle=False, fix_imports=False)
39
+ return buffer.getvalue()
40
+
41
+ def process_result_value(
42
+ self, value: bytes | None, dialect: Dialect
43
+ ) -> np.ndarray[Any, np.dtype[np.floating[Any]]] | None:
44
+ """Convert bytes to a NumPy array."""
45
+ if value is None:
46
+ return None
47
+ return np.load(io.BytesIO(value), allow_pickle=False, fix_imports=False) # type: ignore[no-any-return]
48
+
49
+
50
+ class PickledObject(TypeDecorator[object]):
51
+ """A pickled object column type for SQLAlchemy."""
52
+
53
+ impl = LargeBinary
54
+
55
+ def process_bind_param(self, value: object | None, dialect: Dialect) -> bytes | None:
56
+ """Convert a Python object to bytes."""
57
+ if value is None:
58
+ return None
59
+ return pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL, fix_imports=False)
60
+
61
+ def process_result_value(self, value: bytes | None, dialect: Dialect) -> object | None:
62
+ """Convert bytes to a Python object."""
63
+ if value is None:
64
+ return None
65
+ return pickle.loads(value, fix_imports=False) # type: ignore[no-any-return] # noqa: S301
66
+
67
+
68
+ class HalfVecComparatorMixin(UserDefinedType.Comparator[FloatVector]):
69
+ """A mixin that provides comparison operators for halfvecs."""
70
+
71
+ def cosine_distance(self, other: FloatVector) -> Operators:
72
+ """Compute the cosine distance."""
73
+ return self.op("<=>", return_type=Float)(other)
74
+
75
+ def dot_distance(self, other: FloatVector) -> Operators:
76
+ """Compute the dot product distance."""
77
+ return self.op("<#>", return_type=Float)(other)
78
+
79
+ def euclidean_distance(self, other: FloatVector) -> Operators:
80
+ """Compute the Euclidean distance."""
81
+ return self.op("<->", return_type=Float)(other)
82
+
83
+ def l1_distance(self, other: FloatVector) -> Operators:
84
+ """Compute the L1 distance."""
85
+ return self.op("<+>", return_type=Float)(other)
86
+
87
+ def l2_distance(self, other: FloatVector) -> Operators:
88
+ """Compute the L2 distance."""
89
+ return self.op("<->", return_type=Float)(other)
90
+
91
+
92
+ class HalfVec(UserDefinedType[FloatVector]):
93
+ """A PostgreSQL half-precision vector column type for SQLAlchemy."""
94
+
95
+ cache_ok = True # HalfVec is immutable.
96
+
97
+ def __init__(self, dim: int | None = None) -> None:
98
+ super().__init__()
99
+ self.dim = dim
100
+
101
+ def get_col_spec(self, **kwargs: Any) -> str:
102
+ return f"halfvec({self.dim})"
103
+
104
+ def bind_processor(self, dialect: Dialect) -> Callable[[FloatVector | None], str | None]:
105
+ """Process NumPy ndarray to PostgreSQL halfvec format for bound parameters."""
106
+
107
+ def process(value: FloatVector | None) -> str | None:
108
+ return f"[{','.join(str(x) for x in np.ravel(value))}]" if value is not None else None
109
+
110
+ return process
111
+
112
+ def result_processor(
113
+ self, dialect: Dialect, coltype: Any
114
+ ) -> Callable[[str | None], FloatVector | None]:
115
+ """Process PostgreSQL halfvec format to NumPy ndarray."""
116
+
117
+ def process(value: str | None) -> FloatVector | None:
118
+ if value is None:
119
+ return None
120
+ return np.fromstring(value.strip("[]"), sep=",", dtype=np.float16)
121
+
122
+ return process
123
+
124
+ class comparator_factory(HalfVecComparatorMixin): # noqa: N801
125
+ ...
126
+
127
+
128
+ class Embedding(TypeDecorator[FloatVector]):
129
+ """An embedding column type for SQLAlchemy."""
130
+
131
+ cache_ok = True # Embedding is immutable.
132
+
133
+ impl = NumpyArray
134
+
135
+ def __init__(self, dim: int = -1):
136
+ super().__init__()
137
+ self.dim = dim
138
+
139
+ def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[FloatVector]:
140
+ if dialect.name == "postgresql":
141
+ return dialect.type_descriptor(HalfVec(self.dim))
142
+ return dialect.type_descriptor(NumpyArray())
143
+
144
+ class comparator_factory(HalfVecComparatorMixin): # noqa: N801
145
+ ...
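
A sketch of a custom callable that satisfies the SearchMethod protocol above and can therefore be passed as the search argument of rag. Delegating to vector_search is an illustrative choice, not something this commit prescribes.

from raglite import RAGLiteConfig, vector_search
from raglite._typing import SearchMethod


def vector_only_search(
    query: str, *, num_results: int = 3, config: RAGLiteConfig | None = None
) -> tuple[list[str], list[float]]:
    # Any function with this signature type-checks against the SearchMethod protocol.
    return vector_search(query, num_results=num_results, config=config)


search_method: SearchMethod = vector_only_search
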
src/raglite/py.typed ADDED
File without changes
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """RAGLite test suite."""
tests/conftest.py ADDED
@@ -0,0 +1,102 @@
1
+ """Fixtures for the tests."""
2
+
3
+ import os
4
+ import socket
5
+ import tempfile
6
+ from collections.abc import Generator
7
+ from pathlib import Path
8
+
9
+ import pytest
10
+ from sqlalchemy import create_engine, text
11
+
12
+ from raglite import RAGLiteConfig, insert_document
13
+
14
+ POSTGRES_URL = "postgresql+pg8000://raglite_user:raglite_password@postgres:5432/postgres"
15
+
16
+
17
+ def is_postgres_running() -> bool:
18
+ """Check if PostgreSQL is running."""
19
+ try:
20
+ with socket.create_connection(("postgres", 5432), timeout=1):
21
+ return True
22
+ except OSError:
23
+ return False
24
+
25
+
26
+ def is_openai_available() -> bool:
27
+ """Check if an OpenAI API key is set."""
28
+ return bool(os.environ.get("OPENAI_API_KEY"))
29
+
30
+
31
+ def pytest_sessionstart(session: pytest.Session) -> None:
32
+ """Reset the PostgreSQL and SQLite databases."""
33
+ if is_postgres_running():
34
+ engine = create_engine(POSTGRES_URL, isolation_level="AUTOCOMMIT")
35
+ with engine.connect() as conn:
36
+ for variant in ["local", "remote"]:
37
+ conn.execute(text(f"DROP DATABASE IF EXISTS raglite_test_{variant}"))
38
+ conn.execute(text(f"CREATE DATABASE raglite_test_{variant}"))
39
+
40
+
41
+ @pytest.fixture(scope="session")
42
+ def sqlite_url() -> Generator[str, None, None]:
43
+ """Create a temporary SQLite database file and return the database URL."""
44
+ with tempfile.TemporaryDirectory() as temp_dir:
45
+ db_file = Path(temp_dir) / "raglite_test.sqlite"
46
+ yield f"sqlite:///{db_file}"
47
+
48
+
49
+ @pytest.fixture(
50
+ scope="session",
51
+ params=[
52
+ pytest.param("sqlite", id="sqlite"),
53
+ pytest.param(
54
+ POSTGRES_URL,
55
+ id="postgres",
56
+ marks=pytest.mark.skipif(not is_postgres_running(), reason="PostgreSQL is not running"),
57
+ ),
58
+ ],
59
+ )
60
+ def database(request: pytest.FixtureRequest) -> str:
61
+ """Get a database URL to test RAGLite with."""
62
+ db_url: str = (
63
+ request.getfixturevalue("sqlite_url") if request.param == "sqlite" else request.param
64
+ )
65
+ return db_url
66
+
67
+
68
+ @pytest.fixture(
69
+ scope="session",
70
+ params=[
71
+ pytest.param(
72
+ "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf",
73
+ id="bge_m3",
74
+ ),
75
+ pytest.param(
76
+ "text-embedding-3-small",
77
+ id="openai_text_embedding_3_small",
78
+ marks=pytest.mark.skipif(not is_openai_available(), reason="OpenAI API key is not set"),
79
+ ),
80
+ ],
81
+ )
82
+ def embedder(request: pytest.FixtureRequest) -> str:
83
+ """Get an embedder model URL to test RAGLite with."""
84
+ embedder: str = request.param
85
+ return embedder
86
+
87
+
88
+ @pytest.fixture(scope="session")
89
+ def raglite_test_config(database: str, embedder: str) -> RAGLiteConfig:
90
+ """Create a lightweight in-memory config for testing SQLite and PostgreSQL."""
91
+ # Select the database based on the embedder.
92
+ variant = "local" if embedder.startswith("llama-cpp-python") else "remote"
93
+ if "postgres" in database:
94
+ database = database.replace("/postgres", f"/raglite_test_{variant}")
95
+ elif "sqlite" in database:
96
+ database = database.replace(".sqlite", f"_{variant}.sqlite")
97
+ # Create a RAGLite config for the given database and embedder.
98
+ db_config = RAGLiteConfig(db_url=database, embedder=embedder)
99
+ # Insert a document and update the index.
100
+ doc_path = Path(__file__).parent / "specrel.pdf" # Einstein's special relativity paper.
101
+ insert_document(doc_path, config=db_config)
102
+ return db_config
tests/specrel.pdf ADDED
Binary file (178 kB).
 
tests/test_embed.py ADDED
@@ -0,0 +1,26 @@
1
+ """Test RAGLite's embedding functionality."""
2
+
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+
7
+ from raglite import RAGLiteConfig
8
+ from raglite._embed import embed_sentences
9
+ from raglite._markdown import document_to_markdown
10
+ from raglite._split_sentences import split_sentences
11
+
12
+
13
+ def test_embed(embedder: str) -> None:
14
+ """Test embedding a document."""
15
+ raglite_test_config = RAGLiteConfig(embedder=embedder, embedder_normalize=True)
16
+ doc_path = Path(__file__).parent / "specrel.pdf" # Einstein's special relativity paper.
17
+ doc = document_to_markdown(doc_path)
18
+ sentences = split_sentences(doc, max_len=raglite_test_config.chunk_max_size)
19
+ sentence_embeddings = embed_sentences(sentences, config=raglite_test_config)
20
+ assert isinstance(sentences, list)
21
+ assert isinstance(sentence_embeddings, np.ndarray)
22
+ assert len(sentences) == len(sentence_embeddings)
23
+ assert sentence_embeddings.shape[1] >= 128 # noqa: PLR2004
24
+ assert sentence_embeddings.dtype == np.float16
25
+ assert np.all(np.isfinite(sentence_embeddings))
26
+ assert np.allclose(np.linalg.norm(sentence_embeddings, axis=1), 1.0, rtol=1e-3)
tests/test_import.py ADDED
@@ -0,0 +1,8 @@
1
+ """Test RAGLite."""
2
+
3
+ import raglite
4
+
5
+
6
+ def test_import() -> None:
7
+ """Test that the package can be imported."""
8
+ assert isinstance(raglite.__name__, str)
tests/test_markdown.py ADDED
@@ -0,0 +1,22 @@
1
+ """Test Markdown conversion."""
2
+
3
+ from pathlib import Path
4
+
5
+ from raglite._markdown import document_to_markdown
6
+
7
+
8
+ def test_pdf_with_missing_font_sizes() -> None:
9
+ """Test conversion of a PDF with missing font sizes."""
10
+ # Convert a PDF whose parsed font sizes are all equal to 1.
11
+ doc_path = Path(__file__).parent / "specrel.pdf" # Einstein's special relativity paper.
12
+ doc = document_to_markdown(doc_path)
13
+ # Verify that we can reconstruct the font sizes and heading levels regardless of the missing
14
+ # font size data.
15
+ expected_heading = """
16
+ # ON THE ELECTRODYNAMICS OF MOVING BODIES
17
+
18
+ ## By A. EINSTEIN June 30, 1905
19
+
20
+ It is known that Maxwell
21
+ """.strip()
22
+ assert doc.startswith(expected_heading)
tests/test_rag.py ADDED
@@ -0,0 +1,40 @@
1
+ """Test RAGLite's RAG functionality."""
2
+
3
+ import os
4
+ from typing import TYPE_CHECKING
5
+
6
+ import pytest
7
+ from llama_cpp import llama_supports_gpu_offload
8
+
9
+ from raglite import RAGLiteConfig, hybrid_search, rag, retrieve_chunks
10
+
11
+ if TYPE_CHECKING:
12
+ from raglite._database import Chunk
13
+ from raglite._typing import SearchMethod
14
+
15
+
16
+ def is_accelerator_available() -> bool:
17
+ """Check if an accelerator is available."""
18
+ return llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8 # noqa: PLR2004
19
+
20
+
21
+ @pytest.mark.skipif(not is_accelerator_available(), reason="No accelerator available")
22
+ def test_rag(raglite_test_config: RAGLiteConfig) -> None:
23
+ """Test Retrieval-Augmented Generation."""
24
+ # Assemble different types of search inputs for RAG.
25
+ prompt = "What does it mean for two events to be simultaneous?"
26
+ search_inputs: list[SearchMethod | list[str] | list[Chunk]] = [
27
+ hybrid_search, # A search method as input.
28
+ hybrid_search(prompt, config=raglite_test_config)[0], # Chunk ids as input.
29
+ retrieve_chunks( # Chunks as input.
30
+ hybrid_search(prompt, config=raglite_test_config)[0], config=raglite_test_config
31
+ ),
32
+ ]
33
+ # Answer a question with RAG.
34
+ for search_input in search_inputs:
35
+ stream = rag(prompt, search=search_input, config=raglite_test_config)
36
+ answer = ""
37
+ for update in stream:
38
+ assert isinstance(update, str)
39
+ answer += update
40
+ assert "simultaneous" in answer.lower()
tests/test_rerank.py ADDED
@@ -0,0 +1,55 @@
1
+ """Test RAGLite's reranking functionality."""
2
+
3
+ import pytest
4
+ from rerankers.models.ranker import BaseRanker
5
+
6
+ from raglite import RAGLiteConfig, hybrid_search, rerank_chunks, retrieve_chunks
7
+ from raglite._database import Chunk
8
+ from raglite._flashrank import PatchedFlashRankRanker as FlashRankRanker
9
+
10
+
11
+ @pytest.fixture(
12
+ params=[
13
+ pytest.param(None, id="no_reranker"),
14
+ pytest.param(FlashRankRanker("ms-marco-MiniLM-L-12-v2", verbose=0), id="flashrank_english"),
15
+ pytest.param(
16
+ (
17
+ ("en", FlashRankRanker("ms-marco-MiniLM-L-12-v2", verbose=0)),
18
+ ("other", FlashRankRanker("ms-marco-MultiBERT-L-12", verbose=0)),
19
+ ),
20
+ id="flashrank_multilingual",
21
+ ),
22
+ ],
23
+ )
24
+ def reranker(
25
+ request: pytest.FixtureRequest,
26
+ ) -> BaseRanker | tuple[tuple[str, BaseRanker], ...] | None:
27
+ """Get a reranker to test RAGLite with."""
28
+ reranker: BaseRanker | tuple[tuple[str, BaseRanker], ...] | None = request.param
29
+ return reranker
30
+
31
+
32
+ def test_reranker(
33
+ raglite_test_config: RAGLiteConfig,
34
+ reranker: BaseRanker | tuple[tuple[str, BaseRanker], ...] | None,
35
+ ) -> None:
36
+ """Test inserting a document, updating the indexes, and searching for a query."""
37
+ # Update the config with the reranker.
38
+ raglite_test_config = RAGLiteConfig(
39
+ db_url=raglite_test_config.db_url, embedder=raglite_test_config.embedder, reranker=reranker
40
+ )
41
+ # Search for a query.
42
+ query = "What does it mean for two events to be simultaneous?"
43
+ chunk_ids, _ = hybrid_search(query, num_results=3, config=raglite_test_config)
44
+ # Retrieve the chunks.
45
+ chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
46
+ assert all(isinstance(chunk, Chunk) for chunk in chunks)
47
+ assert all(chunk_id == chunk.id for chunk_id, chunk in zip(chunk_ids, chunks, strict=True))
48
+ # Rerank the chunks given an inverted chunk order.
49
+ reranked_chunks = rerank_chunks(query, chunks[::-1], config=raglite_test_config)
50
+ if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
51
+ assert reranked_chunks[0] == chunks[0]
52
+ # Test that we can also rerank given the chunk_ids only.
53
+ reranked_chunks = rerank_chunks(query, chunk_ids[::-1], config=raglite_test_config)
54
+ if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
55
+ assert reranked_chunks[0] == chunks[0]
tests/test_search.py ADDED
@@ -0,0 +1,48 @@
1
+ """Test RAGLite's search functionality."""
2
+
3
+ import pytest
4
+
5
+ from raglite import (
6
+ RAGLiteConfig,
7
+ hybrid_search,
8
+ keyword_search,
9
+ retrieve_chunks,
10
+ retrieve_segments,
11
+ vector_search,
12
+ )
13
+ from raglite._database import Chunk
14
+ from raglite._typing import SearchMethod
15
+
16
+
17
+ @pytest.fixture(
18
+ params=[
19
+ pytest.param(keyword_search, id="keyword_search"),
20
+ pytest.param(vector_search, id="vector_search"),
21
+ pytest.param(hybrid_search, id="hybrid_search"),
22
+ ],
23
+ )
24
+ def search_method(
25
+ request: pytest.FixtureRequest,
26
+ ) -> SearchMethod:
27
+ """Get a search method to test RAGLite with."""
28
+ search_method: SearchMethod = request.param
29
+ return search_method
30
+
31
+
32
+ def test_search(raglite_test_config: RAGLiteConfig, search_method: SearchMethod) -> None:
33
+ """Test searching for a query."""
34
+ # Search for a query.
35
+ query = "What does it mean for two events to be simultaneous?"
36
+ num_results = 5
37
+ chunk_ids, scores = search_method(query, num_results=num_results, config=raglite_test_config)
38
+ assert len(chunk_ids) == len(scores) == num_results
39
+ assert all(isinstance(chunk_id, str) for chunk_id in chunk_ids)
40
+ assert all(isinstance(score, float) for score in scores)
41
+ # Retrieve the chunks.
42
+ chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
43
+ assert all(isinstance(chunk, Chunk) for chunk in chunks)
44
+ assert all(chunk_id == chunk.id for chunk_id, chunk in zip(chunk_ids, chunks, strict=True))
45
+ assert any("Definition of Simultaneity" in str(chunk) for chunk in chunks)
46
+ # Extend the chunks with their neighbours and group them into contiguous segments.
47
+ segments = retrieve_segments(chunk_ids, neighbors=(-1, 1), config=raglite_test_config)
48
+ assert all(isinstance(segment, str) for segment in segments)
tests/test_split_chunks.py ADDED
@@ -0,0 +1,56 @@
1
+ """Test RAGLite's chunk splitting functionality."""
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from raglite._split_chunks import split_chunks
7
+
8
+
9
+ @pytest.mark.parametrize(
10
+ "sentences",
11
+ [
12
+ pytest.param([], id="one_chunk:no_sentences"),
13
+ pytest.param(["Hello world"], id="one_chunk:one_sentence"),
14
+ pytest.param(["Hello world"] * 2, id="one_chunk:two_sentences"),
15
+ pytest.param(["Hello world"] * 3, id="one_chunk:three_sentences"),
16
+ pytest.param(["Hello world"] * 100, id="one_chunk:many_sentences"),
17
+ pytest.param(["Hello world", "X" * 1000], id="n_chunks:two_sentences_a"),
18
+ pytest.param(["X" * 1000, "Hello world"], id="n_chunks:two_sentences_b"),
19
+ pytest.param(["Hello world", "X" * 1000, "X" * 1000], id="n_chunks:three_sentences_a"),
20
+ pytest.param(["X" * 1000, "Hello world", "X" * 1000], id="n_chunks:three_sentences_b"),
21
+ pytest.param(["X" * 1000, "X" * 1000, "Hello world"], id="n_chunks:three_sentences_c"),
22
+ pytest.param(["X" * 1000] * 100, id="n_chunks:many_sentences_a"),
23
+ pytest.param(["X" * 100] * 1000, id="n_chunks:many_sentences_b"),
24
+ ],
25
+ )
26
+ def test_edge_cases(sentences: list[str]) -> None:
27
+ """Test chunk splitting edge cases."""
28
+ sentence_embeddings = np.ones((len(sentences), 768)).astype(np.float16)
29
+ chunks, chunk_embeddings = split_chunks(
30
+ sentences, sentence_embeddings, sentence_window_size=3, max_size=1440
31
+ )
32
+ assert isinstance(chunks, list)
33
+ assert isinstance(chunk_embeddings, list)
34
+ assert len(chunk_embeddings) == (len(chunks) if sentences else 1)
35
+ assert all(isinstance(chunk, str) for chunk in chunks)
36
+ assert all(isinstance(chunk_embedding, np.ndarray) for chunk_embedding in chunk_embeddings)
37
+ assert all(ce.dtype == sentence_embeddings.dtype for ce in chunk_embeddings)
38
+ assert sum(ce.shape[0] for ce in chunk_embeddings) == sentence_embeddings.shape[0]
39
+ assert all(ce.shape[1] == sentence_embeddings.shape[1] for ce in chunk_embeddings)
40
+
41
+
42
+ @pytest.mark.parametrize(
43
+ "sentences",
44
+ [
45
+ pytest.param(["Hello world" * 1000] + ["X"] * 100, id="first"),
46
+ pytest.param(["X"] * 50 + ["Hello world" * 1000] + ["X"] * 50, id="middle"),
47
+ pytest.param(["X"] * 100 + ["Hello world" * 1000], id="last"),
48
+ ],
49
+ )
50
+ def test_long_sentence(sentences: list[str]) -> None:
51
+ """Test chunking on sentences that are too long."""
52
+ sentence_embeddings = np.ones((len(sentences), 768)).astype(np.float16)
53
+ with pytest.raises(
54
+ ValueError, match="Sentence with length larger than chunk max_size detected."
55
+ ):
56
+ _ = split_chunks(sentences, sentence_embeddings, sentence_window_size=3, max_size=1440)