from typing import Any, Dict, List, Optional, Union

import numpy as np
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

from .vector_store import VectorStore


class AnnoyDB(VectorStore):
    def __init__(
        self,
        embedding_dim: int,
        metric: str = 'angular',
        model_name: str = 'all-MiniLM-L6-v2'  # default model name is an assumption; override as needed
    ) -> None:
        self.documents = []
        self.metadata = []
        self.embedding_dim = embedding_dim
        self.index = AnnoyIndex(embedding_dim, metric)
        self.index_built = False
        # Sentence Transformers model used by add_document() to embed raw text
        self.model = SentenceTransformer(model_name)
    def add_document(self, text: str, metadata: Optional[Dict[str, Any]] = None):
        """
        Add a document to the search index.

        Args:
            text: The document text
            metadata: Optional metadata about the document
        """
        self.documents.append(text)
        self.metadata.append(metadata or {})

        # Generate the embedding with the Sentence Transformers model
        embedding = self.model.encode(text, show_progress_bar=False)

        # Add to the Annoy index; the index must be (re)built before searching
        index_id = len(self.documents) - 1
        self.index.add_item(index_id, embedding)
        self.index_built = False
    def add_documents(self, texts: List[str], embeddings: np.ndarray, metadata_list: Optional[List[Dict[str, Any]]] = None):
        """
        Batch add documents to the search index.

        Args:
            texts: List of document texts
            embeddings: Precomputed embeddings, one row per text
            metadata_list: Optional list of metadata dictionaries
        """
        if metadata_list is None:
            metadata_list = [{} for _ in texts]

        # Add documents and embeddings
        print("Adding to index...")
        for text, metadata, embedding in zip(texts, metadata_list, embeddings):
            self.documents.append(text)
            self.metadata.append(metadata)
            self.index.add_item(len(self.documents) - 1, embedding)
        self.index_built = False
        print("Done")
    def add_data(self, embedding: np.ndarray, document: str):
        """Add a single document with a precomputed embedding."""
        item_id = len(self.documents)
        self.index.add_item(item_id, embedding)
        self.documents.append(document)
        self.metadata.append({})  # keep the metadata list aligned with documents
        self.index_built = False
    def build(self, num_trees: int = 10):
        """Build the Annoy index; required before searching. More trees improve recall at the cost of build time."""
        self.index.build(num_trees)
        self.index_built = True

    def save(self, filepath: str):
        """Persist the built index to disk."""
        self.index.save(filepath)

    def load(self, filepath: str):
        """Load a saved index from disk. Note: Annoy stores only the vectors; documents and metadata must be restored separately."""
        self.index.load(filepath)
        self.index_built = True
    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the top_k most similar documents for a query embedding."""
        if not self.index_built:
            raise RuntimeError("Index not built; call build() before searching.")
        indices, distances = self.index.get_nns_by_vector(
            query_embedding, top_k, include_distances=True
        )
        results = [
            {
                "document": self.documents[idx],
                "score": 1 / (1 + distance)  # Convert distance to similarity
            } for idx, distance in zip(indices, distances)
        ]
        return results
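

# --- Usage sketch (illustrative, not part of the class) ---
# A minimal end-to-end example, assuming the default all-MiniLM-L6-v2 model,
# whose embeddings are 384-dimensional; adjust embedding_dim for other models.
if __name__ == "__main__":
    db = AnnoyDB(embedding_dim=384)
    db.add_document("Annoy builds a forest of random projection trees.")
    db.add_document("Sentence Transformers map text to dense vectors.")
    db.build(num_trees=10)

    query = db.model.encode("How does Annoy index vectors?", show_progress_bar=False)
    for hit in db.search(query, top_k=2):
        print(f"{hit['score']:.3f}  {hit['document']}")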