Spaces:

anpigon
/

langchain-qa-bot

Runtime error

App Files Files Community

langchain-qa-bot/docs/langchain/libs/partners/milvus/langchain_milvus/utils/sparse.py

anpigon

add langchain docs

ed4d993 about 1 year ago

raw

history blame contribute delete

2.11 kB

	from abc import ABC, abstractmethod
	from typing import Dict, List

	from scipy.sparse import csr_array # type: ignore


	class BaseSparseEmbedding(ABC):
	    """Interface for sparse embedding models.

	    Inherit from this class and implement both abstract methods to plug in a
	    custom sparse embedding model. The class cannot be instantiated until
	    ``embed_query`` and ``embed_documents`` are overridden.
	    """

	    @abstractmethod
	    def embed_query(self, query: str) -> Dict[int, float]:
	        """Embed query text.

	        Args:
	            query: The query string to embed.

	        Returns:
	            The sparse embedding as a mapping from integer keys to float
	            values.
	        """

	    @abstractmethod
	    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
	        """Embed search docs.

	        Args:
	            texts: The documents to embed.

	        Returns:
	            A list of sparse embeddings, each a mapping from integer keys to
	            float values.
	        """


	class BM25SparseEmbedding(BaseSparseEmbedding):
	    """Sparse vector embedding model based on BM25.

	    Implements ``BaseSparseEmbedding`` on top of ``BM25EmbeddingFunction`` from
	    ``pymilvus.model.sparse``. The BM25 model is fitted on the given corpus
	    when the instance is constructed.

	    This model requires pymilvus[model] to be installed:
	    `pip install pymilvus[model]`

	    For more information please refer to:
	    https://milvus.io/docs/embed-with-bm25.md
	    """

	    def __init__(self, corpus: List[str], language: str = "en"):
	        """Build the analyzer and fit the BM25 embedding function.

	        Args:
	            corpus: Texts passed to ``BM25EmbeddingFunction.fit``.
	            language: Language passed to ``build_default_analyzer``.
	        """
	        # Imported here rather than at module level, so pymilvus[model] is
	        # only needed when this class is actually instantiated.
	        from pymilvus.model.sparse import BM25EmbeddingFunction  # type: ignore
	        from pymilvus.model.sparse.bm25.tokenizers import (  # type: ignore
	            build_default_analyzer,
	        )

	        self.analyzer = build_default_analyzer(language=language)
	        self.bm25_ef = BM25EmbeddingFunction(self.analyzer, num_workers=1)
	        self.bm25_ef.fit(corpus)

	    def embed_query(self, text: str) -> Dict[int, float]:
	        """Embed a single query string.

	        Args:
	            text: The query text; it is encoded as a one-element batch.

	        Returns:
	            Mapping from column index to non-zero value of the encoded query.
	        """
	        return self._sparse_to_dict(self.bm25_ef.encode_queries([text]))

	    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
	        """Embed a list of documents.

	        Args:
	            texts: The documents to encode.

	        Returns:
	            One ``{column index: value}`` mapping per item obtained by
	            iterating the result of ``encode_documents``.
	        """
	        sparse_arrays = self.bm25_ef.encode_documents(texts)
	        return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]

	    def _sparse_to_dict(self, sparse_array: csr_array) -> Dict[int, float]:
	        """Convert a sparse row into a ``{column index: value}`` mapping.

	        Args:
	            sparse_array: A SciPy sparse array holding one embedding, either
	                as a one-row 2-D array or as a 1-D array (which of the two
	                is produced by iterating a ``csr_array`` depends on the
	                SciPy version).

	        Returns:
	            Mapping from column index to value for every non-zero entry,
	            using built-in ``int`` keys and ``float`` values. If the same
	            column occurs more than once, the last value wins.
	        """
	        # COO form exposes the column coordinate and the value of each stored
	        # entry side by side, so they cannot get out of step. Pairing
	        # ``nonzero()`` with ``.data`` would misalign as soon as the array
	        # stores an explicit zero, and unpacking ``nonzero()`` into two
	        # names fails for 1-D arrays.
	        coo = sparse_array.tocoo()
	        return {
	            int(col_index): float(value)
	            for col_index, value in zip(coo.col, coo.data)
	            if value != 0
	        }