Spaces:
Running
Running
from abc import ABC, abstractmethod | |
from typing import Dict, List | |
from scipy.sparse import csr_array # type: ignore | |
class BaseSparseEmbedding(ABC):
    """Interface for sparse embedding models.

    Inherit from this class and implement both methods to provide a custom
    sparse embedding model. A sparse embedding is represented as a mapping
    from an integer dimension index to a float weight.
    """

    @abstractmethod
    def embed_query(self, query: str) -> Dict[int, float]:
        """Embed query text.

        Args:
            query: The query text to embed.

        Returns:
            The sparse embedding of the query as ``{index: weight}``.
        """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed search docs.

        Args:
            texts: The document texts to embed.

        Returns:
            One ``{index: weight}`` sparse embedding per input text.
        """
class BM25SparseEmbedding(BaseSparseEmbedding):
    """Sparse vector embedding model based on BM25.

    This class inherits from BaseSparseEmbedding and uses the BM25 model from
    the Milvus model library to implement sparse vector embedding.
    This model requires pymilvus[model] to be installed.
    `pip install pymilvus[model]`

    For more information please refer to:
    https://milvus.io/docs/embed-with-bm25.md
    """

    def __init__(self, corpus: List[str], language: str = "en"):
        """Build the default analyzer for ``language`` and fit BM25 on ``corpus``.

        Args:
            corpus: Texts the BM25 embedding function is fitted on.
            language: Language passed to ``build_default_analyzer``.
        """
        # Imported here rather than at module level, so pymilvus[model] is
        # only required when this class is actually instantiated.
        from pymilvus.model.sparse import BM25EmbeddingFunction  # type: ignore
        from pymilvus.model.sparse.bm25.tokenizers import (  # type: ignore
            build_default_analyzer,
        )

        self.analyzer = build_default_analyzer(language=language)
        self.bm25_ef = BM25EmbeddingFunction(self.analyzer, num_workers=1)
        self.bm25_ef.fit(corpus)

    def embed_query(self, text: str) -> Dict[int, float]:
        """Embed a single query text.

        Args:
            text: The query text to embed.

        Returns:
            The sparse embedding of the query as ``{index: weight}``.
        """
        # The encoder takes a batch, so the query is wrapped in a one-item list.
        return self._sparse_to_dict(self.bm25_ef.encode_queries([text]))

    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed a list of document texts.

        Args:
            texts: The document texts to embed.

        Returns:
            One ``{index: weight}`` dict per item yielded by iterating the
            encoder output (presumably one per input text).
        """
        sparse_arrays = self.bm25_ef.encode_documents(texts)
        return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]

    def _sparse_to_dict(self, sparse_array: csr_array) -> Dict[int, float]:
        """Convert a sparse row into a ``{column_index: value}`` dict.

        Args:
            sparse_array: A SciPy sparse array holding one embedding. If it
                has several rows, entries from all rows are merged and the
                last value seen for a column wins.

        Returns:
            Mapping from column index to non-zero value, using native Python
            numbers rather than NumPy scalars.
        """
        # Take indices and values from the same COO view so they stay aligned.
        # Pairing ``nonzero()`` (which drops explicitly stored zeros) with
        # ``.data`` (which keeps them) can assign weights to the wrong columns.
        # ``.col`` also avoids unpacking ``nonzero()`` into exactly two arrays,
        # which fails if a row arrives as a 1-D sparse array
        # (NOTE(review): row dimensionality depends on the SciPy version).
        coo = sparse_array.tocoo()
        keep = coo.data != 0
        columns = coo.col[keep].tolist()
        weights = coo.data[keep].tolist()
        return dict(zip(columns, weights))