"""Indexing with vector database."""

from pathlib import Path
import re
import chromadb
from unidecode import unidecode
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load the given PDF files and split their pages into chunks.

    Args:
        list_file_path: Iterable of PDF file paths; one ``PyPDFLoader`` is
            created per entry.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for the concatenated pages of
        all input files, in input order.
    """
    # Build every loader up front, then read them in order.
    pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in list_file_path]
    all_pages = [
        page for pdf_loader in pdf_loaders for page in pdf_loader.load()
    ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(all_pages)


def create_collection_name(filepath):
    """Derive a vector-database collection name from a file path.

    The file stem (name without extension) is normalised as follows:

    1. spaces become hyphens;
    2. Unicode text is transliterated to ASCII with ``unidecode`` (so
       non-Latin file names, e.g. Arabic or Chinese, still yield a name);
    3. every run of characters outside ``A-Za-z0-9`` collapses to one hyphen;
    4. the result is cut to at most 50 characters;
    5. names shorter than 3 characters get ``"xyz"`` appended;
    6. a non-alphanumeric first character is replaced by ``"A"`` and a
       non-alphanumeric last character by ``"Z"``.

    The input path and the resulting name are printed to stdout.

    Args:
        filepath: Path of the source file (anything ``pathlib.Path`` accepts).

    Returns:
        The sanitised collection name as a ``str``.
    """
    stem = Path(filepath).stem

    # Spaces -> hyphens, then ASCII transliteration of whatever is left.
    name = unidecode(stem.replace(" ", "-"))

    # Collapse every non-alphanumeric run into a single hyphen, cap at 50.
    name = re.sub("[^A-Za-z0-9]+", "-", name)[:50]

    # Pad too-short names; this also guarantees the indexing below is safe.
    if len(name) < 3:
        name += "xyz"

    # Force alphanumeric characters at both ends.
    first_ok = name[0].isalnum()
    if not first_ok:
        name = "A" + name[1:]
    last_ok = name[-1].isalnum()
    if not last_ok:
        name = name[:-1] + "Z"

    print("\n\nFilepath: ", filepath)
    print("Collection name: ", name)
    return name


def create_db(splits, collection_name):
    """Embed the document splits and store them in a new Chroma collection.

    Args:
        splits: Documents to index; passed to ``Chroma.from_documents`` as
            ``documents``.
        collection_name: Name of the collection to create.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    # Multilingual sentence-transformers model. The original source also kept
    # "sentence-transformers/all-MiniLM-L6-v2" as a commented-out alternative,
    # together with optional model_kwargs={"device": "cpu"} and
    # encode_kwargs={'normalize_embeddings': False}.
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    )

    # NOTE(review): presumably resets chromadb's shared client state so that
    # the fresh client below does not reuse an earlier one - confirm against
    # the chromadb documentation.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    ephemeral_client = chromadb.EphemeralClient()

    # No persist_directory is passed (it was commented out in the original).
    return Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        client=ephemeral_client,
        collection_name=collection_name,
    )