import os from typing import List, Dict, Union, Optional, Any import numpy as np from .embedding_provider import EmbeddingProvider from .database.annoydb import AnnoyDB from .keyword_search_provider import KeywordSearchProvider class HybridSearch: def __init__( self, embedding_provider: EmbeddingProvider, documents: List[str] = None, ann_filepath: Optional[str] = None, semantic_weight: float = 0.7, keyword_weight: float = 0.3 ) -> None: self.embedding_provider = embedding_provider self.documents = documents if ann_filepath and os.path.exists(ann_filepath): self.index = AnnoyDB self.embeddings = self.embedding_provider.embed_documents(documents) self.vector_db = AnnoyDB( embedding_dim=self.embeddings.shape[1] ) for emb, doc in zip(self.embeddings, documents): self.vector_db.add_data(emb, doc) self.vector_db.build() # Keyword Search Setup self.keyword_search = KeywordSearchProvider(documents) # Weights for hybrid search self.semantic_weight = semantic_weight self.keyword_weight = keyword_weight self.documents = documents def hybrid_search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]: # Embed query query_embedding = self.embedding_provider.embed_query(query) # Perform semantic search semantic_results = self.vector_db.search(query_embedding, top_k) # Perform keyword search keyword_results = self.keyword_search.search(query, top_k) # Combine results with weighted scoring combined_results = {} for result in semantic_results: doc = result['document'] combined_results[doc] = { 'semantic_score': result['score'] * self.semantic_weight, 'keyword_score': 0, 'hybrid_score': result['score'] * self.semantic_weight } for result in keyword_results: doc = result['document'] if doc in combined_results: combined_results[doc]['keyword_score'] = result['score'] * self.keyword_weight combined_results[doc]['hybrid_score'] += result['score'] * self.keyword_weight else: combined_results[doc] = { 'semantic_score': 0, 'keyword_score': result['score'] * self.keyword_weight, 'hybrid_score': result['score'] * self.keyword_weight } # Sort and return top results sorted_results = sorted( [ {**{'document': doc}, **scores} for doc, scores in combined_results.items() ], key=lambda x: x['hybrid_score'], reverse=True ) return sorted_results[:top_k] def set_weights(self, semantic_weight: float, keyword_weight: float): """ Dynamically update search weights. Args: semantic_weight: New weight for semantic search keyword_weight: New weight for keyword search """ if not (0 <= semantic_weight <= 1 and 0 <= keyword_weight <= 1): raise ValueError("Weights must be between 0 and 1") if not np.isclose(semantic_weight + keyword_weight, 1.0): raise ValueError("Semantic and keyword weights must sum to 1.0") self.semantic_weight = semantic_weight self.keyword_weight = keyword_weight