Spaces:

mickkhaw
/

ragalicious-app

Paused

File size: 6,104 Bytes

7a6fa31

import os
from typing import List
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.vectorstores import (
    MyScale,
    MyScaleSettings,
)
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_core.callbacks.manager import (
    CallbackManagerForRetrieverRun,
)
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant.vectorstores import Qdrant

from .metadata import CUISINES, DIETS, EQUIPMENT, KEY_INGREDIENTS, OCCASIONS

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#QDRANT_CLOUD_KEY = os.environ.get("QDRANT_CLOUD_KEY")
#QDRANT_CLOUD_URL = "https://30591e3d-7092-41c4-95e1-4d3c7ef6e894.us-east4-0.gcp.cloud.qdrant.io"


# Define embedding model
base_embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)


def get_ensemble_retriever():
    # Use a Qdrant VectorStore to embed and store our data
    qdrant_descriptions = Qdrant.from_existing_collection(
        embedding=base_embeddings_model,
        # 3 vector indices - recipe_descriptions, recipe_nutrition, recipe_ingredients
        collection_name="recipe_descriptions",
        url=QDRANT_CLOUD_URL,
        api_key=QDRANT_CLOUD_KEY,
    )

    qdrant_nutrition = Qdrant.from_existing_collection(
        embedding=base_embeddings_model,
        collection_name="recipe_nutrition",
        url=QDRANT_CLOUD_URL,
        api_key=QDRANT_CLOUD_KEY,
    )

    qdrant_ingredients = Qdrant.from_existing_collection(
        embedding=base_embeddings_model,
        collection_name="recipe_ingredients",
        url=QDRANT_CLOUD_URL,
        api_key=QDRANT_CLOUD_KEY,
    )

    # Convert retrieved documents to JSON-serializable format
    descriptions_retriever = qdrant_descriptions.as_retriever(search_kwargs={"k": 20})
    nutrition_retriever = qdrant_nutrition.as_retriever(search_kwargs={"k": 20})
    ingredients_retriever = qdrant_ingredients.as_retriever(search_kwargs={"k": 20})

    ensemble_retriever = EnsembleRetriever(
        retrievers=[
            descriptions_retriever,
            nutrition_retriever,
            ingredients_retriever,
        ],
        weights=[
            0.5,
            0.25,
            0.25,
        ],
    )

    return ensemble_retriever


def _list_to_string(l: list) -> str:
    return ", ".join([f"`{item}`" for item in l])


class ModifiedSelfQueryRetriever(SelfQueryRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Get documents relevant for a query.

        Args:
            query: string to find relevant documents for

        Returns:
            List of relevant documents
        """
        structured_query = self.query_constructor.invoke(
            {"query": query}, config={"callbacks": run_manager.get_child()}
        )
        # if self.verbose:
        #     logger.info(f"Generated Query: {structured_query}")

        new_query, search_kwargs = self._prepare_query(query, structured_query)

        print("search_kwargs", search_kwargs)
        self.search_kwargs = search_kwargs

        docs = self._get_docs_with_query(new_query, search_kwargs)
        return docs


def get_self_retriever(llm_model):
    metadata_field_info = [
        AttributeInfo(
            name="cuisine",
            description="The national / ethnic cuisine categories of the recipe."
            f"It should be one of {_list_to_string(CUISINES)}. "
            "It only supports contain comparisons. "
            f"Here are some examples: contain (cuisine, '{CUISINES[0]}')",
            type="list[string]",
        ),
        AttributeInfo(
            name="diet",
            description="The diets / dietary restrictions satisfied by this recipe."
            f"It should be one of {_list_to_string(DIETS)}. "
            "It only supports contain comparisons. "
            f"Here are some examples: contain (diet, '{DIETS[0]}')",
            type="list[string]",
        ),
        AttributeInfo(
            name="equipment",
            description="The equipment required by this recipe."
            f"It should be one of {_list_to_string(EQUIPMENT)}. "
            "It only supports contain comparisons. "
            f"Here are some examples: contain (equipment, '{EQUIPMENT[0]}')",
            type="list[string]",
        ),
        AttributeInfo(
            name="occasion",
            description="The occasions, holidays, celebrations that are well suited for this recipe."
            f"It should be one of {_list_to_string(OCCASIONS)}. "
            "It only supports contain comparisons. "
            f"Here are some examples: contain (occasion, '{OCCASIONS[0]}')",
            type="list[string]",
        ),
        # AttributeInfo(
        #     name="ingredients",
        #     description="The ingredients used to make this recipe."
        #     f"It should be one of {_list_to_string(KEY_INGREDIENTS)}"
        #     "It only supports contain comparisons. "
        #     f"Here are some examples: contain (ingredients, '{KEY_INGREDIENTS[0]}')",
        #     type="list[string]",
        # ),
        AttributeInfo(
            name="time",
            description="The estimated time in minutes required to cook and prepare the recipe",
            type="integer",
        ),
    ]

    config = MyScaleSettings(
        host=os.environ["MYSCALE_HOST"],
        port=443,
        username=os.environ["MYSCALE_USERNAME"],
        password=os.environ["MYSCALE_PASSWORD"],
    )
    vectorstore = MyScale(base_embeddings_model, config)

    retriever = ModifiedSelfQueryRetriever.from_llm(
        llm_model,
        vectorstore,
        "Brief summary and key attributes of a recipe, including ingredients, cooking time, occasion, cuisine and diet",
        metadata_field_info,
        verbose=True,
        search_kwargs={"k": 10},
    )
    return retriever