from typing import List, Type

from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS

from .debug import FakeEmbeddings, FakeVectorStore
from .parsing import File


class FolderIndex:
    """Vector index over the documents of a group of files (a "folder")."""

    def __init__(self, files: List[File], index: VectorStore):
        # Every instance currently carries the same fixed name.
        self.name: str = "default"
        self.files = files
        self.index: VectorStore = index

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Flatten the documents of ``files`` into one list.

        Each document's ``metadata`` is updated in place with the
        ``file_name`` and ``file_id`` of the file it belongs to, so the
        caller's document objects are modified as a side effect.
        """

        def _tagged_documents():
            for source_file in files:
                for document in source_file.docs:
                    document.metadata["file_name"] = source_file.name
                    document.metadata["file_id"] = source_file.id
                    yield document

        return list(_tagged_documents())

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Build a ``FolderIndex`` from ``files``.

        The documents of all files are combined (and tagged with their
        file name/id) and handed to ``vector_store.from_documents``
        together with ``embeddings``; the resulting store becomes the
        index of the returned instance.
        """
        return cls(
            files=files,
            index=vector_store.from_documents(
                documents=cls._combine_files(files),
                embedding=embeddings,
            ),
        )


def embed_files(
    files: List[File], embedding: str, vector_store: str, **kwargs
) -> FolderIndex:
    """Embed ``files`` and return them wrapped in a ``FolderIndex``.

    NOTE: ``embedding``, ``vector_store`` and ``**kwargs`` are accepted
    but not consulted by this implementation. The embedding model is
    always ``HuggingFaceBgeEmbeddings`` with the ``BAAI/bge-small-en``
    model on CPU, and the vector store is always ``FAISS``.
    """
    bge_embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        # Normalized embeddings: set True to compute cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
    return FolderIndex.from_files(
        files=files, embeddings=bge_embeddings, vector_store=FAISS
    )