from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader

# Load the HTML documents from ./source
loader = DirectoryLoader('.', glob="./source/*.html", loader_cls=BSHTMLLoader)
docs = loader.load()

# Split the text into overlapping chunks; starting with a 1000-character chunk size
print("Splitting into chunks")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Initialize the embedding model and the persistent Chroma store
print("Creating embeddings")
persist_directory = "./index/chroma"
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Storing in db")
vectordb = Chroma.from_documents(documents=texts, embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb.persist()  # write the index to disk so it can be reloaded later

# Earlier FAISS experiment with toy sentences and metadata filtering, kept for
# reference (needs: from langchain.vectorstores import FAISS):
# sentences = ["This is an example sentence", "Each sentence is converted",
#              "A monkey in zoo", "Shark in the park", "Boss on loose", "a quiet date"]
# ids = ["a1", "a2", "a3", "a4", "a5", "a6"]
# metadatas = [{"n": 1}, {"z": 2}, {}, {}, {"n": 3}, {"n": 4}]
# docsearch = FAISS.from_texts(sentences, embeddings, metadatas, ids)
# m = docsearch.similarity_search_with_score("Each sentence is converted", filter={"n": 2})

# Query the Chroma store
print("Querying db")
query = "How to Increase Flexibility Without Losing Productivity"
results = vectordb.similarity_search(query)  # renamed so it no longer shadows the loaded docs
for doc in results:
    print("\n")
    print(doc.page_content)
print("Done")
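
# ---------------------------------------------------------------------------
# Optional extensions (sketches, not part of the original run).
#
# 1) Metadata-filtered search: Chroma's similarity_search accepts a `filter`
#    dict matched against document metadata. DirectoryLoader/BSHTMLLoader
#    record the file path under the "source" key; the file name below is
#    hypothetical.
hits = vectordb.similarity_search(
    "How to Increase Flexibility Without Losing Productivity",
    k=4,
    filter={"source": "source/flexibility.html"},  # hypothetical file name
)

# 2) Question answering over the index: the original imports included
#    RetrievalQA but never used it. A minimal sketch, assuming an OpenAI API
#    key is available; any LangChain LLM wrapper could stand in for OpenAI().
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI  # assumed LLM backend, not in the original script

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),                       # swap in your preferred LLM wrapper
    chain_type="stuff",                 # stuff retrieved chunks into the prompt
    retriever=vectordb.as_retriever(),  # reuse the Chroma store built above
)
print(qa.run("How to Increase Flexibility Without Losing Productivity"))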