sparksearch-demo / SmartSearch /keyword_search_provider.py
teddyllm's picture
Upload 20 files
bd3532f verified
raw
history blame contribute delete
918 Bytes
from typing import List, Dict, Union
class KeywordSearchProvider:
    """Keyword search over a fixed document collection using TF-IDF.

    The documents are vectorized once at construction time with a
    ``TfidfVectorizer`` (default settings); each query is projected into the
    same vector space and ranked against the documents by cosine similarity.
    """

    def __init__(self, documents: List[str]):
        """Fit the TF-IDF vocabulary on *documents* and cache their vectors.

        Args:
            documents: The texts to index. A snapshot of the sequence is
                stored so that result lookups stay aligned with the rows of
                the fitted matrix.
        """
        # Imported lazily so that importing this module does not require
        # scikit-learn until a provider is actually built.
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Snapshot the input: row i of the matrix must keep matching
        # self.documents[i] even if the caller mutates its list afterwards
        # or passed a one-shot iterable.
        self.documents = list(documents)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the *top_k* documents most similar to *query*.

        Args:
            query: Free-text query; vectorized with the fitted vocabulary.
            top_k: Maximum number of results. Values of zero or less yield
                an empty list; values larger than the collection return
                every document.

        Returns:
            A list of ``{"document": str, "score": float}`` dicts ordered
            from highest to lowest cosine similarity.
        """
        # Guard first: a slice of ``[-0:]`` would select the whole array, so
        # top_k == 0 used to return every document instead of none.
        if top_k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        # cosine_similarity returns a (1, n_documents) array; take the row.
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        # argsort is ascending: keep the last top_k indices, then reverse so
        # the best match comes first.
        top_indices = similarities.argsort()[-top_k:][::-1]

        return [
            {
                "document": self.documents[idx],
                # Convert the NumPy scalar to a built-in float to match the
                # annotated return type.
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]