Spaces:
Sleeping
Sleeping
File size: 918 Bytes
bd3532f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
from typing import List, Dict, Union
class KeywordSearchProvider:
    """Keyword search over a fixed list of documents using TF-IDF vectors.

    The documents are vectorized once at construction time; each call to
    ``search`` vectorizes the query with the same fitted vectorizer and ranks
    the documents by cosine similarity to it.
    """

    def __init__(self, documents: List[str]):
        """Fit a TF-IDF vectorizer on ``documents`` and keep the matrix.

        Args:
            documents: The texts to index. The list is stored as-is (not
                copied) and is indexed by position when building results.
        """
        # Imported inside the method, so scikit-learn is only loaded when a
        # provider is actually constructed.
        from sklearn.feature_extraction.text import TfidfVectorizer

        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)
        self.documents = documents

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query, vectorized with the fitted vectorizer.
            top_k: Maximum number of results. If it exceeds the number of
                documents, all documents are returned. Values <= 0 yield an
                empty list.

        Returns:
            A list of ``{"document": <text>, "score": <similarity>}`` dicts,
            ordered from highest to lowest cosine similarity.
        """
        # A non-positive top_k must not reach the slice below: ``[-0:]`` is
        # ``[0:]`` (everything), and a negative value turns into a positive
        # start offset, both of which return the wrong set of documents.
        if top_k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        # cosine_similarity returns a (1, n_documents) array; take its only row.
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        # argsort is ascending, so take the last top_k indices and reverse
        # them to get best-first order.
        top_indices = similarities.argsort()[-top_k:][::-1]

        return [
            {
                "document": self.documents[idx],
                # Convert the NumPy scalar to a built-in float to match the
                # declared return type.
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]