Spaces:
Sleeping
Sleeping
from typing import List, Dict, Union | |
class KeywordSearchProvider:
    """Keyword search over a fixed set of documents using TF-IDF vectors.

    The documents are vectorized once at construction time; each query is
    projected into the same vector space and ranked by cosine similarity.
    """

    def __init__(self, documents: List[str]):
        """Fit a TF-IDF vectorizer on ``documents`` and keep the matrix.

        Args:
            documents: The texts to index. They are copied into a list so
                that result indices stay aligned with the rows of the
                TF-IDF matrix even if the caller later mutates the
                argument (or passed a one-shot iterable).
        """
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Snapshot first: the same sequence is used both for fitting and
        # for looking results up by row index in ``search``.
        self.documents = list(documents)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query, vectorized with the fitted vectorizer.
            top_k: Maximum number of results. Values larger than the
                number of indexed documents return every document;
                values ``<= 0`` return an empty list.

        Returns:
            A list of ``{"document": str, "score": float}`` dicts ordered
            from highest to lowest cosine similarity.
        """
        # ``[-top_k:]`` with top_k == 0 would slice the *whole* array, and a
        # negative top_k would slice from the front, so handle both up front.
        if top_k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        # One query row in, so row 0 holds the similarity to every document.
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        # argsort is ascending: take the last top_k indices, then reverse
        # them so the best match comes first.
        top_indices = similarities.argsort()[-top_k:][::-1]

        return [
            {
                "document": self.documents[idx],
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]