"""Generation and evaluation of evals."""

from random import randint
from typing import ClassVar

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field, field_validator
from sqlmodel import Session, func, select
from tqdm.auto import tqdm, trange

from raglite._config import RAGLiteConfig
from raglite._database import Chunk, Document, Eval, create_database_engine
from raglite._extract import extract_with_llm
from raglite._rag import rag
from raglite._search import hybrid_search, retrieve_segments, vector_search
from raglite._typing import SearchMethod


def insert_evals(  # noqa: C901
    *, num_evals: int = 100, max_contexts_per_eval: int = 20, config: RAGLiteConfig | None = None
) -> None:
    """Generate and insert evals into the database."""

    class QuestionResponse(BaseModel):
        """A specific question about the content of a set of document contexts."""

        question: str = Field(
            ...,
            description="A specific question about the content of a set of document contexts.",
            min_length=1,
        )
        system_prompt: ClassVar[str] = """
You are given a set of contexts extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to generate a question to quiz other subject matter experts on the information in the provided context.
The question MUST satisfy ALL of the following criteria:
- The question SHOULD integrate as much of the provided context as possible.
- The question MUST NOT be a general or open question, but MUST instead be as specific to the provided context as possible.
- The question MUST be completely answerable using ONLY the information in the provided context, without depending on any background information.
- The question MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
- The question MUST NOT reference the existence of the context, directly or indirectly.
- The question MUST treat the context as if its contents are entirely part of your working memory.
            """.strip()

        @field_validator("question")
        @classmethod
        def validate_question(cls, value: str) -> str:
            """Validate the question."""
            question = value.strip().lower()
            # Reject questions that leak the existence of the context or that aren't questions.
            if "context" in question or "document" in question or "question" in question:
                error_message = "The question must not reference the context or the document."
                raise ValueError(error_message)
            if not question.endswith("?"):
                error_message = "The question must end with a question mark."
                raise ValueError(error_message)
            return value

    config = config or RAGLiteConfig()
    engine = create_database_engine(config)
    with Session(engine) as session:
        for _ in trange(num_evals, desc="Generating evals", unit="eval", dynamic_ncols=True):
            # Sample a random document from the database.
            seed_document = session.exec(select(Document).order_by(func.random()).limit(1)).first()
            if seed_document is None:
                error_message = "First run `insert_document()` before generating evals."
                raise ValueError(error_message)
            # Sample a random chunk from that document.
            seed_chunk = session.exec(
                select(Chunk)
                .where(Chunk.document_id == seed_document.id)
                .order_by(func.random())
                .limit(1)
            ).first()
            if seed_chunk is None:
                continue
            # Expand the seed chunk into a set of related chunks.
            related_chunk_ids, _ = vector_search(
                np.mean(seed_chunk.embedding_matrix, axis=0, keepdims=True),
                num_results=randint(2, max_contexts_per_eval // 2),  # noqa: S311
                config=config,
            )
            related_chunks = retrieve_segments(related_chunk_ids, config=config)
            # Extract a question from the seed chunk's related chunks.
            try:
                question_response = extract_with_llm(
                    QuestionResponse, related_chunks, config=config
                )
            except ValueError:
                continue
            else:
                question = question_response.question
            # Search for candidate chunks to answer the generated question.
            candidate_chunk_ids, _ = hybrid_search(
                question, num_results=max_contexts_per_eval, config=config
            )
            candidate_chunks = [session.get(Chunk, chunk_id) for chunk_id in candidate_chunk_ids]

            # Determine which candidate chunks are relevant to answer the generated question.
            class ContextEvalResponse(BaseModel):
                """Indicate whether the provided context can be used to answer a given question."""

                hit: bool = Field(
                    ...,
                    description="True if the provided context contains (a part of) the answer to the given question, false otherwise.",
                )
                system_prompt: ClassVar[str] = f"""
You are given a context extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to answer whether the provided context contains (a part of) the answer to this question: "{question}"
An example of a context that does NOT contain (a part of) the answer is a table of contents.
                    """.strip()

            relevant_chunks = []
            for candidate_chunk in tqdm(
                candidate_chunks, desc="Evaluating chunks", unit="chunk", dynamic_ncols=True
            ):
                try:
                    context_eval_response = extract_with_llm(
                        ContextEvalResponse, str(candidate_chunk), config=config
                    )
                except ValueError:  # noqa: PERF203
                    pass
                else:
                    if context_eval_response.hit:
                        relevant_chunks.append(candidate_chunk)
            if not relevant_chunks:
                continue

            # Answer the question using the relevant chunks.
            class AnswerResponse(BaseModel):
                """Answer a question using the provided context."""

                answer: str = Field(
                    ...,
                    description="A complete answer to the given question using the provided context.",
                    min_length=1,
                )
                system_prompt: ClassVar[str] = f"""
You are given a set of contexts extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to generate a complete answer to the following question using the provided context: "{question}"
The answer MUST satisfy ALL of the following criteria:
- The answer MUST integrate as much of the provided context as possible.
- The answer MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
- The answer MUST NOT reference the existence of the context, directly or indirectly.
- The answer MUST treat the context as if its contents are entirely part of your working memory.
                    """.strip()

            try:
                answer_response = extract_with_llm(
                    AnswerResponse,
                    [str(relevant_chunk) for relevant_chunk in relevant_chunks],
                    config=config,
                )
            except ValueError:
                continue
            else:
                answer = answer_response.answer
            # Store the eval in the database.
            eval_ = Eval.from_chunks(
                question=question,
                contexts=relevant_chunks,
                ground_truth=answer,
            )
            session.add(eval_)
            session.commit()


def answer_evals(
    num_evals: int = 100,
    search: SearchMethod = hybrid_search,
    *,
    config: RAGLiteConfig | None = None,
) -> pd.DataFrame:
    """Read evals from the database and answer them with RAG."""
    # Read evals from the database.
    config = config or RAGLiteConfig()
    engine = create_database_engine(config)
    with Session(engine) as session:
        evals = session.exec(select(Eval).limit(num_evals)).all()
    # Answer evals with RAG.
    answers: list[str] = []
    contexts: list[list[str]] = []
    for eval_ in tqdm(evals, desc="Answering evals", unit="eval", dynamic_ncols=True):
        response = rag(eval_.question, search=search, config=config)
        answer = "".join(response)
        answers.append(answer)
        chunk_ids, _ = search(eval_.question, config=config)
        contexts.append(retrieve_segments(chunk_ids, config=config))
    # Collect the answered evals.
    answered_evals: dict[str, list[str] | list[list[str]]] = {
        "question": [eval_.question for eval_ in evals],
        "answer": answers,
        "contexts": contexts,
        "ground_truth": [eval_.ground_truth for eval_ in evals],
        "ground_truth_contexts": [eval_.contexts for eval_ in evals],
    }
    answered_evals_df = pd.DataFrame.from_dict(answered_evals)
    return answered_evals_df


def evaluate(
    answered_evals: pd.DataFrame | int = 100,
    config: RAGLiteConfig | None = None,
) -> pd.DataFrame:
    """Evaluate the performance of a set of answered evals with Ragas."""
    try:
        from datasets import Dataset
        from langchain_community.chat_models import ChatLiteLLM
        from langchain_community.embeddings import LlamaCppEmbeddings
        from langchain_community.llms import LlamaCpp
        from ragas import RunConfig
        from ragas import evaluate as ragas_evaluate

        from raglite._litellm import LlamaCppPythonLLM
    except ImportError as import_error:
        error_message = "To use the `evaluate` function, please install the `ragas` extra."
        raise ImportError(error_message) from import_error

    # Create a set of answered evals if not provided.
    config = config or RAGLiteConfig()
    answered_evals_df = (
        answered_evals
        if isinstance(answered_evals, pd.DataFrame)
        else answer_evals(num_evals=answered_evals, config=config)
    )
    # Load the LLM.
    if config.llm.startswith("llama-cpp-python"):
        llm = LlamaCppPythonLLM().llm(model=config.llm)
        lc_llm = LlamaCpp(
            model_path=llm.model_path,
            n_batch=llm.n_batch,
            n_ctx=llm.n_ctx(),
            n_gpu_layers=-1,
            verbose=llm.verbose,
        )
    else:
        lc_llm = ChatLiteLLM(model=config.llm)  # type: ignore[call-arg]
    # Load the embedder.
    if not config.embedder.startswith("llama-cpp-python"):
        error_message = "Currently, only `llama-cpp-python` embedders are supported."
        raise NotImplementedError(error_message)
    embedder = LlamaCppPythonLLM().llm(model=config.embedder, embedding=True)
    lc_embedder = LlamaCppEmbeddings(  # type: ignore[call-arg]
        model_path=embedder.model_path,
        n_batch=embedder.n_batch,
        n_ctx=embedder.n_ctx(),
        n_gpu_layers=-1,
        verbose=embedder.verbose,
    )
    # Evaluate the answered evals with Ragas.
    evaluation_df = ragas_evaluate(
        dataset=Dataset.from_pandas(answered_evals_df),
        llm=lc_llm,
        embeddings=lc_embedder,
        run_config=RunConfig(max_workers=1),
    ).to_pandas()
    return evaluation_df
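
# Note: the DataFrame returned by `evaluate` contains the answered evals alongside one score
# column per Ragas metric (which metrics are reported depends on the installed Ragas version).
# A quick way to summarise a run (sketch):
#
#     evaluation_df = evaluate(answer_evals(num_evals=10))
#     print(evaluation_df.select_dtypes("number").mean())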