#!/usr/bin/env python3
"""
Builds and persists a LangChain vector store over the Pulze documentation using Chroma.

Source: https://github.com/Arize-ai/phoenix/blob/main/scripts/data/build_langchain_vector_store.py
"""

import argparse
import getpass
import logging
import os
import shutil
import sys
from functools import partial
from typing import List

from langchain.docstore.document import Document as LangChainDocument
from langchain.document_loaders import GitbookLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from tiktoken import Encoding, encoding_for_model


def load_gitbook_docs(docs_url: str) -> List[LangChainDocument]:
    """Loads documents from a Gitbook URL.

    Args:
        docs_url (str): URL to Gitbook docs.

    Returns:
        List[LangChainDocument]: List of documents in LangChain format.
    """
    loader = GitbookLoader(
        docs_url,
        load_all_paths=True,
    )
    return loader.load()


def tiktoken_len(text: str, tokenizer: Encoding) -> int:
    """Returns the length of a text in tokens.

    Args:
        text (str): The text to tokenize and count.
        tokenizer (tiktoken.Encoding): The tokenizer.

    Returns:
        int: The number of tokens in the text.
    """
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


def chunk_docs(
    documents: List[LangChainDocument],
    tokenizer: Encoding,
    chunk_size: int = 400,
    chunk_overlap: int = 20,
) -> List[LangChainDocument]:
    """Chunks the documents.

    The chunking strategy used in this function is from the following notebook
    and accompanying video:

    - https://github.com/pinecone-io/examples/blob/master/generation/langchain/handbook/xx-langchain-chunking.ipynb
    - https://www.youtube.com/watch?v=eqOfr4AGLk8

    Args:
        documents (List[LangChainDocument]): A list of input documents.
        tokenizer (tiktoken.Encoding): The tokenizer used to count the number of
            tokens in a text.
        chunk_size (int, optional): The size of the chunks in tokens.
        chunk_overlap (int, optional): The chunk overlap in tokens.

    Returns:
        List[LangChainDocument]: The chunked documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=partial(tiktoken_len, tokenizer=tokenizer),
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_documents(documents)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--persist-path",
        type=str,
        required=False,
        help="Path to persist index.",
        default="langchain-chroma-pulze-docs",
    )
    args = parser.parse_args()

    # Prompt for the OpenAI API key if it isn't already set; OpenAIEmbeddings
    # below reads it from the environment. getpass keeps the key out of the
    # terminal echo and shell history.
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

    docs_url = "https://docs.pulze.ai/"
    embedding_model_name = "text-embedding-ada-002"

    # Crawl the Gitbook site and split each page into ~200-token chunks.
    langchain_documents = load_gitbook_docs(docs_url)
    chunked_langchain_documents = chunk_docs(
        langchain_documents,
        tokenizer=encoding_for_model(embedding_model_name),
        chunk_size=200,
    )

    # Embed the chunks and persist them to a fresh Chroma collection on disk.
    embedding_model = OpenAIEmbeddings(model=embedding_model_name)
    shutil.rmtree(args.persist_path, ignore_errors=True)
    vector_store = Chroma.from_documents(
        chunked_langchain_documents,
        embedding=embedding_model,
        persist_directory=args.persist_path,
    )

    # Read the index back from disk as a smoke test.
    read_vector_store = Chroma(
        persist_directory=args.persist_path, embedding_function=embedding_model
    )
    # print(read_vector_store.similarity_search("How do I use Pulze?"))
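
# ---------------------------------------------------------------------------
# Example usage (a minimal sketch, kept commented out so the build script's
# behavior is unchanged): load the persisted index from a separate process and
# run a scored similarity search. The persist directory, model name, and query
# string below are illustrative and assume the defaults used above.
#
#   from langchain.embeddings import OpenAIEmbeddings
#   from langchain.vectorstores import Chroma
#
#   store = Chroma(
#       persist_directory="langchain-chroma-pulze-docs",
#       embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
#   )
#   for document, score in store.similarity_search_with_score(
#       "How do I use Pulze?", k=2
#   ):
#       # GitbookLoader records each page's URL under the "source" metadata key.
#       print(round(score, 3), document.metadata.get("source"))
# ---------------------------------------------------------------------------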