File size: 918 Bytes
bd3532f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from typing import List, Dict, Union

class KeywordSearchProvider:
    """Rank a fixed collection of documents against a query using TF-IDF.

    The documents are vectorized once, at construction time, with
    scikit-learn's ``TfidfVectorizer``. Each call to :meth:`search`
    transforms the query with that same fitted vectorizer and orders the
    documents by cosine similarity to it.
    """

    def __init__(self, documents: List[str]):
        """Fit a TF-IDF vectorizer on *documents* and keep the resulting matrix.

        Args:
            documents: Texts to index. They are copied into a list, so any
                iterable of strings is accepted.
        """
        # Local import: scikit-learn is only loaded when a provider is built.
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Rows of the TF-IDF matrix are matched back to documents by position,
        # so keep a private snapshot: a one-shot iterable would be exhausted
        # by fitting, and later caller-side mutation of the original list
        # would put indices and documents out of step.
        self.documents = list(documents)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """Return the documents most similar to *query*, best match first.

        Args:
            query: Text to compare against the indexed documents.
            top_k: Maximum number of results. Values less than or equal to
                zero yield an empty list.

        Returns:
            A list of ``{"document": ..., "score": ...}`` dicts ordered by
            descending score, holding at most ``top_k`` entries (fewer when
            fewer documents are indexed). No minimum score is applied, so the
            highest-scoring documents are returned however low their scores.
        """
        # Guard first: ``[-0:]`` slices the *whole* array, so top_k == 0 would
        # otherwise return every document, and a negative top_k would return
        # an arbitrary tail of the ranking.
        if top_k <= 0:
            return []

        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        # cosine_similarity returns a (1, n_documents) array; take its only row.
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        # argsort is ascending: keep the last top_k indices, then reverse them
        # so the highest score comes first.
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [
            {
                "document": self.documents[idx],
                # Plain Python float rather than a NumPy scalar, to match the
                # declared return type.
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]