from typing import List, Dict, Union, Any, Optional

import numpy as np
from annoy import AnnoyIndex

from .vector_store import VectorStore


class AnnoyDB(VectorStore):
    """Vector store backed by an Annoy approximate-nearest-neighbour index.

    Document texts and their metadata are kept in two parallel lists; the
    position of a document in those lists is also its item id in the Annoy
    index, so every code path that adds an item must update all three.
    """

    def __init__(
        self,
        embedding_dim: int,
        metric: str = 'angular',
        model: Any = None,
    ) -> None:
        """
        Create an empty index.

        Args:
            embedding_dim: Length of every embedding vector added to the index
            metric: Distance metric name handed to ``AnnoyIndex``
            model: Optional encoder exposing
                ``encode(text, show_progress_bar=False)``; only needed when
                ``add_document`` is called without a precomputed embedding
        """
        self.documents: List[str] = []
        self.metadata: List[Dict[str, Any]] = []
        self.embedding_dim = embedding_dim
        self.index = AnnoyIndex(embedding_dim, metric)
        self.index_built = False
        # Only assign when supplied so that a ``model`` attribute possibly
        # provided by the base class is not shadowed with None.
        if model is not None:
            self.model = model

    def _append_item(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]],
        embedding: Any,
    ) -> None:
        """Add one item to the index and keep the parallel lists in sync."""
        # Insert into the index first: if Annoy rejects the vector, the
        # document/metadata lists are left untouched and ids stay aligned.
        self.index.add_item(len(self.documents), embedding)
        self.documents.append(text)
        self.metadata.append(metadata or {})
        self.index_built = False

    def add_document(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None,
        embedding: Optional[np.ndarray] = None,
    ) -> None:
        """
        Add a document to the search index.

        Args:
            text: The document text
            metadata: Optional metadata about the document
            embedding: Optional precomputed embedding for ``text``; when
                omitted, the embedding is generated with ``self.model``

        Raises:
            ValueError: If no embedding is given and no model is available
        """
        if embedding is None:
            encoder = getattr(self, "model", None)
            if encoder is None:
                raise ValueError(
                    "add_document() needs either an explicit `embedding` or "
                    "a `model` passed to AnnoyDB(...) to encode the text"
                )
            # Generate the embedding before touching any state so that a
            # failing encoder cannot leave a document without an index entry.
            embedding = encoder.encode(text, show_progress_bar=False)

        self._append_item(text, metadata, embedding)

    def add_documents(
        self,
        texts: List[str],
        embeddings: np.ndarray,
        metadata_list: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Batch add documents to the search index.

        Args:
            texts: List of document texts
            embeddings: One embedding per entry of ``texts``, in the same order
            metadata_list: Optional list of metadata dictionaries

        Raises:
            ValueError: If ``texts``, ``embeddings`` and ``metadata_list`` do
                not all have the same length
        """
        if metadata_list is None:
            metadata_list = [{} for _ in texts]

        # zip() would silently drop the unmatched tail, so check up front.
        if not (len(texts) == len(embeddings) == len(metadata_list)):
            raise ValueError(
                "texts, embeddings and metadata_list must have the same "
                f"length (got {len(texts)}, {len(embeddings)}, "
                f"{len(metadata_list)})"
            )

        # Add documents and embeddings
        print("Adding to index...")
        for text, metadata, embedding in zip(texts, metadata_list, embeddings):
            self._append_item(text, metadata, embedding)
        print("Done")

    def add_data(
        self,
        embedding: np.ndarray,
        document: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Add a single precomputed embedding together with its document text.

        Args:
            embedding: Embedding vector for ``document``
            document: The document text
            metadata: Optional metadata about the document
        """
        self._append_item(document, metadata, embedding)

    def build(self, num_trees: int = 10) -> None:
        """
        Build the Annoy index and mark it as built.

        Args:
            num_trees: Number of trees passed to ``AnnoyIndex.build``
        """
        self.index.build(num_trees)
        self.index_built = True

    def save(self, filepath: str) -> None:
        """
        Save the Annoy index to ``filepath``.

        Only the index itself is written; ``self.documents`` and
        ``self.metadata`` are not persisted by this method.
        """
        self.index.save(filepath)

    def load(self, filepath: str) -> None:
        """
        Load an Annoy index from ``filepath`` and mark it as built.

        ``self.documents`` and ``self.metadata`` are not restored here, so the
        caller must repopulate them in the original order before ``search``
        can map result ids back to document texts.
        """
        self.index.load(filepath)
        self.index_built = True

    def search(
        self,
        query_embedding: np.ndarray,
        top_k: int = 5,
    ) -> List[Dict[str, Union[str, float]]]:
        """
        Return the ``top_k`` nearest documents to ``query_embedding``.

        Args:
            query_embedding: Embedding vector of the query
            top_k: Maximum number of results requested from the index

        Returns:
            One dict per hit with the keys ``"document"`` (the stored text)
            and ``"score"`` (``1 / (1 + distance)``).
        """
        indices, distances = self.index.get_nns_by_vector(
            query_embedding,
            top_k,
            include_distances=True,
        )

        # NOTE(review): 1 / (1 + distance) divides by zero if a metric ever
        # reports a distance of exactly -1 -- confirm which metrics are used.
        return [
            {
                "document": self.documents[idx],
                "score": 1 / (1 + distance),  # Convert distance to similarity
            }
            for idx, distance in zip(indices, distances)
        ]