from typing import List, Type

from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS

from knowledge_gpt.core.debug import FakeVectorStore, FakeEmbeddings
from knowledge_gpt.core.parsing import File


class FolderIndex:
    """A group of files (a "folder") paired with the vector index built over them."""

    def __init__(self, files: List[File], index: VectorStore):
        # The files are kept next to the index that was built from their documents.
        self.files = files
        self.index: VectorStore = index
        self.name: str = "default"

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Flatten the documents of every file into one list.

        Each document's metadata is updated in place with the ``file_name``
        and ``file_id`` of the file it belongs to, so the objects held in
        ``file.docs`` are modified as well.
        """
        combined = []
        for source_file in files:
            for document in source_file.docs:
                # Tag the document with its origin before collecting it.
                document.metadata["file_name"] = source_file.name
                document.metadata["file_id"] = source_file.id
                combined.append(document)
        return combined

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Build a FolderIndex by indexing every document of ``files``.

        ``vector_store`` is a vector store *class*; its ``from_documents``
        constructor is called with the combined documents and ``embeddings``.
        """
        documents = cls._combine_files(files)
        built_index = vector_store.from_documents(
            documents=documents,
            embedding=embeddings,
        )
        return cls(files=files, index=built_index)


def embed_files(
    files: List[File], embedding: str, vector_store: str, **kwargs
) -> FolderIndex:
    """Embed a collection of files and wrap the result in a FolderIndex.

    Args:
        files: The files whose documents are indexed.
        embedding: Key of the embedding backend, ``"openai"`` or ``"debug"``.
        vector_store: Key of the vector store backend, ``"faiss"`` or ``"debug"``.
        **kwargs: Passed unchanged to the constructor of the selected
            embedding class.

    Returns:
        The FolderIndex produced by ``FolderIndex.from_files``.

    Raises:
        NotImplementedError: If ``embedding`` or ``vector_store`` is not one
            of the supported keys.
    """
    embedding_classes: dict[str, Type[Embeddings]] = {
        "openai": OpenAIEmbeddings,
        "debug": FakeEmbeddings,
    }
    vector_store_classes: dict[str, Type[VectorStore]] = {
        "faiss": FAISS,
        "debug": FakeVectorStore,
    }

    embedding_cls = embedding_classes.get(embedding)
    if embedding_cls is None:
        raise NotImplementedError(f"Embedding {embedding} not supported.")
    # The embeddings object is created before the vector store key is checked.
    embeddings_instance = embedding_cls(**kwargs)

    vector_store_cls = vector_store_classes.get(vector_store)
    if vector_store_cls is None:
        raise NotImplementedError(f"Vector store {vector_store} not supported.")

    return FolderIndex.from_files(
        files=files,
        embeddings=embeddings_instance,
        vector_store=vector_store_cls,
    )