"""
Builds and persists a LangChain vector store over the Pulze documentation using Chroma.

Source: https://github.com/Arize-ai/phoenix/blob/main/scripts/data/build_langchain_vector_store.py
"""
|
import argparse
import getpass
import logging
import os
import shutil
import sys
from functools import partial
from typing import List

from langchain.docstore.document import Document as LangChainDocument
from langchain.document_loaders import GitbookLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from tiktoken import Encoding, encoding_for_model
|
|
def load_gitbook_docs(docs_url: str) -> List[LangChainDocument]:
    """Loads documents from a Gitbook URL.

    Args:
        docs_url (str): URL to Gitbook docs.

    Returns:
        List[LangChainDocument]: List of documents in LangChain format.
    """
    loader = GitbookLoader(
        docs_url,
        load_all_paths=True,
    )
    return loader.load()
|
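# A minimal usage sketch (requires network access; the exact metadata keys are
# indicative, not guaranteed). With load_all_paths=True, GitbookLoader crawls
# the site's sitemap and returns one Document per page, recording the page URL
# in each document's metadata:
#
#     docs = load_gitbook_docs("https://docs.pulze.ai/")
#     print(len(docs), docs[0].metadata)  # e.g. {'source': '...', 'title': '...'}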
|
|
def tiktoken_len(text: str, tokenizer: Encoding) -> int:
    """Returns the length of a text in tokens.

    Args:
        text (str): The text to tokenize and count.
        tokenizer (tiktoken.Encoding): The tokenizer.

    Returns:
        int: The number of tokens in the text.
    """
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)
|
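# Chunk sizes below are measured in tokens, not characters. Passing
# disallowed_special=() makes tiktoken encode special-token strings such as
# "<|endoftext|>" as ordinary text instead of raising a ValueError, which can
# otherwise happen on scraped pages. A quick sketch:
#
#     tokenizer = encoding_for_model("text-embedding-ada-002")
#     tiktoken_len("hello world", tokenizer)  # 2 tokens
#     len("hello world")                      # 11 characters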
|
|
def chunk_docs(
    documents: List[LangChainDocument],
    tokenizer: Encoding,
    chunk_size: int = 400,
    chunk_overlap: int = 20,
) -> List[LangChainDocument]:
    """Chunks the documents.

    The chunking strategy used in this function is from the following notebook and
    accompanying video:

    - https://github.com/pinecone-io/examples/blob/master/generation/langchain/handbook/xx-langchain-chunking.ipynb
    - https://www.youtube.com/watch?v=eqOfr4AGLk8

    Args:
        documents (List[LangChainDocument]): A list of input documents.
        tokenizer (tiktoken.Encoding): The tokenizer used to count the number of
            tokens in a text.
        chunk_size (int, optional): The size of the chunks in tokens.
        chunk_overlap (int, optional): The chunk overlap in tokens.

    Returns:
        List[LangChainDocument]: The chunked documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=partial(tiktoken_len, tokenizer=tokenizer),
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_documents(documents)
|
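# RecursiveCharacterTextSplitter works through the separators in order: it first
# splits on paragraph breaks ("\n\n"), then falls back to line breaks, spaces,
# and finally individual characters, until every chunk fits within chunk_size
# tokens as measured by tiktoken_len. A minimal sketch with a made-up document:
#
#     doc = LangChainDocument(page_content="first paragraph\n\nsecond paragraph")
#     chunks = chunk_docs(
#         [doc],
#         tokenizer=encoding_for_model("text-embedding-ada-002"),
#         chunk_size=4,
#         chunk_overlap=0,
#     )
#     # -> two documents, split at the paragraph break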
|
|
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--persist-path",
        type=str,
        required=False,
        help="Path to persist index.",
        default="langchain-chroma-pulze-docs",
    )
    args = parser.parse_args()

    # OpenAIEmbeddings reads the key from the OPENAI_API_KEY environment
    # variable; prompt for it if it is not already set.
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

    docs_url = "https://docs.pulze.ai/"
    embedding_model_name = "text-embedding-ada-002"
    langchain_documents = load_gitbook_docs(docs_url)
    chunked_langchain_documents = chunk_docs(
        langchain_documents,
        tokenizer=encoding_for_model(embedding_model_name),
        chunk_size=200,
    )

    embedding_model = OpenAIEmbeddings(model=embedding_model_name)
    # Remove any stale index so the build starts from a clean directory.
    shutil.rmtree(args.persist_path, ignore_errors=True)
    vector_store = Chroma.from_documents(
        chunked_langchain_documents,
        embedding=embedding_model,
        persist_directory=args.persist_path,
    )
    vector_store.persist()

    # Re-open the persisted index to verify it can be read back from disk.
    read_vector_store = Chroma(
        persist_directory=args.persist_path, embedding_function=embedding_model
    )
    logging.info(
        "Persisted %d chunks to %s",
        len(read_vector_store.get()["documents"]),
        args.persist_path,
    )
|
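# Once built, the index can be reloaded and queried from another process. A
# minimal sketch (the query string is just an illustration):
#
#     store = Chroma(
#         persist_directory="langchain-chroma-pulze-docs",
#         embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
#     )
#     for hit in store.similarity_search("How do I authenticate requests?", k=2):
#         print(hit.metadata["source"], hit.page_content[:80])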
|