"""Build a Chroma vector store from a PDF using sentence-transformer embeddings."""

import os
import sys
from typing import Optional

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Alternative embedding backend that was previously disabled in this file:
#   from langchain.embeddings import HuggingFaceEmbeddings
#   embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

embeddings_model_name = "multi-qa-MiniLM-L6-cos-v1"
persist_directory = "db"
target_source_chunks = 4
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Instantiated once at import time and used as the default embedding function
# of load_vectorestore_from_pdf.
embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)


def load_vectorestore_from_pdf(
    path: str,
    embeddings=embeddings,
    persist: Optional[bool] = True,
) -> Optional[Chroma]:
    """Index the PDF at *path* into a Chroma vector store.

    The PDF is loaded with ``PyPDFLoader`` and split with a
    ``CharacterTextSplitter`` configured for ``chunk_size=1000`` and
    ``chunk_overlap=0`` before being embedded.

    Args:
        path: Location of the PDF file to index.
        embeddings: Embedding function handed to ``Chroma.from_documents``;
            defaults to the module-level ``embeddings`` instance.
        persist: When truthy (the default), the store is created under the
            module-level ``persist_directory`` and ``persist()`` is called on
            it. When falsy (including ``None``), the store is created with
            ``persist_directory=None`` and returned to the caller.

    Returns:
        The ``Chroma`` store when ``persist`` is falsy; ``None`` when the
        store was persisted instead.
    """
    pages = PyPDFLoader(path).load()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    if not persist:
        return Chroma.from_documents(chunks, embeddings, persist_directory=None)

    store = Chroma.from_documents(
        chunks, embeddings, persist_directory=persist_directory
    )
    store.persist()
    # The persisted store is intentionally not handed back; callers get None.
    return None


if __name__ == "__main__":
    # The function requires a PDF path; the previous no-argument call always
    # raised TypeError, so take the path from the command line instead.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-pdf>")
    load_vectorestore_from_pdf(sys.argv[1])