from typing import Any, Dict, List, Optional, Union

import numpy as np
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

from .vector_store import VectorStore


class AnnoyDB(VectorStore):
    def __init__(
        self,
        embedding_dim: int,
        metric: str = 'angular',
        model_name: str = 'all-MiniLM-L6-v2'  # default model name is an assumption; override as needed
    ) -> None:
        self.documents = []
        self.metadata = []
        self.embedding_dim = embedding_dim
        self.index = AnnoyIndex(embedding_dim, metric)
        self.index_built = False
        # Sentence Transformers model used by add_document() to embed raw text
        self.model = SentenceTransformer(model_name)
    def add_document(self, text: str, metadata: Optional[Dict[str, Any]] = None):
        """
        Add a document to the search index.

        Args:
            text: The document text
            metadata: Optional metadata about the document
        """
        self.documents.append(text)
        self.metadata.append(metadata or {})

        # Generate the embedding with the Sentence Transformers model
        embedding = self.model.encode(text, show_progress_bar=False)

        # Add to the Annoy index; the index must be (re)built before searching
        index_id = len(self.documents) - 1
        self.index.add_item(index_id, embedding)
        self.index_built = False
    def add_documents(self, texts: List[str], embeddings: np.ndarray, metadata_list: Optional[List[Dict[str, Any]]] = None):
        """
        Batch add documents to the search index.

        Args:
            texts: List of document texts
            embeddings: Precomputed embeddings, one row per text
            metadata_list: Optional list of metadata dictionaries
        """
        if metadata_list is None:
            metadata_list = [{} for _ in texts]

        # Add documents and embeddings
        print("Adding to index...")
        for text, metadata, embedding in zip(texts, metadata_list, embeddings):
            self.documents.append(text)
            self.metadata.append(metadata)
            self.index.add_item(len(self.documents) - 1, embedding)
        self.index_built = False
        print("Done")
    def add_data(self, embedding: np.ndarray, document: str):
        """Add a single document with a precomputed embedding."""
        item_id = len(self.documents)
        self.index.add_item(item_id, embedding)
        self.documents.append(document)
        self.metadata.append({})  # keep the metadata list aligned with documents
        self.index_built = False
    def build(self, num_trees: int = 10):
        """Build the Annoy index; required before searching. More trees improve recall at the cost of build time."""
        self.index.build(num_trees)
        self.index_built = True

    def save(self, filepath: str):
        """Persist the built index to disk."""
        self.index.save(filepath)

    def load(self, filepath: str):
        """Load a saved index from disk. Note: Annoy stores only the vectors; documents and metadata must be restored separately."""
        self.index.load(filepath)
        self.index_built = True
    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the top_k most similar documents for a query embedding."""
        if not self.index_built:
            raise RuntimeError("Index not built; call build() before searching.")
        indices, distances = self.index.get_nns_by_vector(
            query_embedding, top_k, include_distances=True
        )
        results = [
            {
                "document": self.documents[idx],
                "score": 1 / (1 + distance)  # Convert distance to similarity
            } for idx, distance in zip(indices, distances)
        ]
        return results
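

# --- Usage sketch (illustrative, not part of the class) ---
# A minimal end-to-end example, assuming the default all-MiniLM-L6-v2 model,
# whose embeddings are 384-dimensional; adjust embedding_dim for other models.
if __name__ == "__main__":
    db = AnnoyDB(embedding_dim=384)
    db.add_document("Annoy builds a forest of random projection trees.")
    db.add_document("Sentence Transformers map text to dense vectors.")
    db.build(num_trees=10)

    query = db.model.encode("How does Annoy index vectors?", show_progress_bar=False)
    for hit in db.search(query, top_k=2):
        print(f"{hit['score']:.3f}  {hit['document']}")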