Spaces:
Sleeping
Sleeping
| from typing import List, Dict, Union | |
class KeywordSearchProvider:
    """Keyword search over a fixed corpus using TF-IDF and cosine similarity.

    The corpus is vectorized once at construction time; each call to
    :meth:`search` vectorizes the query with the same fitted vocabulary and
    ranks the documents by cosine similarity to it.
    """

    def __init__(self, documents: List[str]):
        """Fit a TF-IDF model on ``documents``.

        Args:
            documents: The texts to index. A snapshot is taken, so later
                mutation of the caller's list does not affect the index.
        """
        # Imported lazily so that importing this module does not require
        # scikit-learn until a provider is actually built.
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Snapshot first: row i of the matrix must always correspond to
        # self.documents[i], even if the caller mutates its list afterwards
        # or passed a one-shot iterable.
        self.documents = list(documents)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query; vectorized with the fitted vocabulary.
            top_k: Maximum number of results. Values ``<= 0`` yield an empty
                list; values larger than the corpus return every document.

        Returns:
            A list of ``{"document": str, "score": float}`` dicts ordered by
            descending similarity. Documents with a similarity of 0 are still
            included when fewer than ``top_k`` documents match.
        """
        # Guard before slicing: ``[-0:]`` would select the whole array and a
        # negative top_k would turn into a positive start offset.
        if top_k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        # argsort is ascending; reverse it and keep the first top_k entries.
        top_indices = similarities.argsort()[::-1][:top_k]

        return [
            {
                "document": self.documents[int(idx)],
                # Plain float so the result matches the annotation.
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]