Spaces:

isl-research
/

sparksearch-demo

Sleeping

App Files Files Community

sparksearch-demo / SmartSearch /database /annoydb.py

teddyllm

Upload 20 files

bd3532f verified 17 days ago

raw

history blame contribute delete

2.88 kB

	from typing import List, Dict, Union, Any

	import numpy as np
	from annoy import AnnoyIndex

	from .vector_store import VectorStore

	class AnnoyDB(VectorStore):
	    """
	    Vector store that keeps embeddings in an Annoy index.

	    The Annoy item id of every entry is its position in ``self.documents``;
	    ``self.metadata`` is kept at the same length so that the index, the
	    documents and the metadata always line up.
	    """

	    def __init__(
	        self,
	        embedding_dim: int,
	        metric: str = 'angular',
	        model: Any = None,
	    ) -> None:
	        """
	        Create an empty, unbuilt index.

	        Args:
	            embedding_dim: Length of every vector stored in the index.
	            metric: Metric name handed to ``AnnoyIndex``.
	            model: Optional encoder exposing
	                ``encode(text, show_progress_bar=False)``. Only needed when
	                ``add_document`` is called without an explicit embedding.
	        """
	        self.documents: List[str] = []
	        self.metadata: List[Dict[str, Any]] = []
	        self.embedding_dim = embedding_dim

	        self.index = AnnoyIndex(embedding_dim, metric)
	        self.index_built = False

	        # Bind the encoder only when one is supplied, so an existing
	        # ``model`` attribute (class-level or set later by the caller)
	        # is never overwritten with None.
	        if model is not None:
	            self.model = model

	    def _store_item(
	        self,
	        text: str,
	        embedding: np.ndarray,
	        metadata: Union[Dict[str, Any], None],
	    ) -> None:
	        """Add one vector to the index and record its text and metadata."""
	        # Write to the index first: if Annoy rejects the vector, the
	        # document and metadata lists are left untouched and stay aligned.
	        self.index.add_item(len(self.documents), embedding)
	        self.documents.append(text)
	        self.metadata.append({} if metadata is None else metadata)
	        self.index_built = False

	    def add_document(
	        self,
	        text: str,
	        metadata: Union[Dict[str, Any], None] = None,
	        embedding: Union[np.ndarray, None] = None,
	    ) -> None:
	        """
	        Add a document to the search index.

	        Args:
	            text: The document text
	            metadata: Optional metadata about the document
	            embedding: Optional precomputed vector for ``text``. When
	                omitted, the vector is produced by ``self.model.encode``.

	        Raises:
	            AttributeError: If no embedding is given and no ``model`` is set.
	        """
	        if embedding is None:
	            encoder = getattr(self, "model", None)
	            if encoder is None:
	                raise AttributeError(
	                    "AnnoyDB has no embedding model: pass `embedding=` "
	                    "or provide `model` when constructing AnnoyDB"
	                )
	            embedding = encoder.encode(text, show_progress_bar=False)

	        self._store_item(text, embedding, metadata)

	    def add_documents(
	        self,
	        texts: List[str],
	        embeddings: np.ndarray,
	        metadata_list: Union[List[Dict[str, Any]], None] = None,
	    ) -> None:
	        """
	        Batch add documents to the search index.

	        Args:
	            texts: List of document texts
	            embeddings: One vector per entry of ``texts``, in the same order
	            metadata_list: Optional list of metadata dictionaries

	        Raises:
	            ValueError: If ``texts``, ``embeddings`` and ``metadata_list``
	                do not all have the same length.
	        """
	        if metadata_list is None:
	            metadata_list = [{} for _ in texts]

	        # zip() would silently drop the surplus entries of the longer
	        # inputs, so reject mismatched lengths up front.
	        if not (len(texts) == len(embeddings) == len(metadata_list)):
	            raise ValueError(
	                "texts, embeddings and metadata_list must have the same "
	                f"length (got {len(texts)}, {len(embeddings)}, "
	                f"{len(metadata_list)})"
	            )

	        # Add documents and embeddings
	        print("Adding to index...")
	        for text, metadata, embedding in zip(texts, metadata_list, embeddings):
	            self._store_item(text, embedding, metadata)
	        print("Done")

	    def add_data(self, embedding: np.ndarray, document: str) -> None:
	        """
	        Add a single precomputed embedding together with its document text.

	        An empty metadata entry is recorded so ``self.metadata`` stays the
	        same length as ``self.documents``.

	        Args:
	            embedding: Vector for ``document``
	            document: The document text
	        """
	        self._store_item(document, embedding, None)

	    def build(self, num_trees: int = 10) -> None:
	        """
	        Build the Annoy index and mark it as built.

	        Args:
	            num_trees: Number of trees passed to ``AnnoyIndex.build``
	        """
	        self.index.build(num_trees)
	        self.index_built = True

	    def save(self, filepath: str) -> None:
	        """
	        Write the Annoy index to ``filepath``.

	        Only the index is written; ``self.documents`` and ``self.metadata``
	        are not persisted by this method.
	        """
	        self.index.save(filepath)

	    def load(self, filepath: str) -> None:
	        """
	        Load an Annoy index from ``filepath`` and mark it as built.

	        Only the index is read; ``self.documents`` and ``self.metadata`` are
	        left as they are, so the caller must repopulate them before
	        ``search`` can map item ids back to text.
	        """
	        self.index.load(filepath)
	        # A loaded index is queryable as-is, so treat it as built.
	        self.index_built = True

	    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
	        """
	        Return the nearest stored documents for a query vector.

	        Args:
	            query_embedding: Query vector
	            top_k: Maximum number of neighbours requested from Annoy

	        Returns:
	            One dict per neighbour with keys ``"document"`` (the stored
	            text) and ``"score"`` (``1 / (1 + distance)``).

	        Raises:
	            IndexError: If the index returns an id with no entry in
	                ``self.documents`` (e.g. after ``load`` without restoring
	                the documents).
	        """
	        indices, distances = self.index.get_nns_by_vector(
	            query_embedding, top_k, include_distances=True
	        )

	        # Convert distance to similarity.
	        # NOTE(review): this mapping assumes distance >= 0; confirm it is
	        # appropriate for every metric in use (e.g. 'dot').
	        return [
	            {"document": self.documents[idx], "score": 1 / (1 + distance)}
	            for idx, distance in zip(indices, distances)
	        ]