Spaces:

berkatil
/

mrr

Running

App Files Files Community

mrr / mrr.py

berkatil

fix

6b816f7 6 months ago

raw

history blame contribute delete

3.88 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Mean reciprocal rank metric"""

	import evaluate
	import datasets
	import json
	from ranx import Qrels, Run
	from ranx import evaluate as ran_evaluate


	_CITATION = """\
	@inproceedings{ranx,
	author = {Elias Bassani},
	title = {ranx: {A} Blazing-Fast Python Library for Ranking Evaluation and Comparison},
	booktitle = {{ECIR} {(2)}},
	series = {Lecture Notes in Computer Science},
	volume = {13186},
	pages = {259--264},
	publisher = {Springer},
	year = {2022},
	doi = {10.1007/978-3-030-99739-7\_30}
	}
	"""

	# Human-readable summary of the metric. It is passed to
	# evaluate.MetricInfo(description=...) and is also prepended to the metric
	# class docstring through add_start_docstrings.
	_DESCRIPTION = """\
	This is the mean reciprocal rank (mrr) metric for retrieval systems.
	It is the multiplicative inverse of the rank of the first retrieved relevant document: 1 for first place, 1/2 for second place, 1/3 for third place, and so on. You can refer to [here](https://amenra.github.io/ranx/metrics/#mean-reciprocal-rank)
	"""


	_KWARGS_DESCRIPTION = """
	Args:
	predictions: dictionary of dictionaries where each dictionary consists of document relevancy scores produced by the model for a given query
	One dictionary per query.
	references: List of list of strings where each lists consists of the relevant document names for a given query in a sorted relevancy order.
	The outer list is sorted from query one to n.
	k: `int`, optional, default is None, it is to calculate mrr@k
	Returns:
	mrr (`float`): mean reciprocal rank. Minimum possible value is 0. Maximum possible value is 1.0
	Examples:

	>>> my_new_module = evaluate.load("mrr")
	>>> references= [json.dumps({"q_1":{"d_1":1, "d_2":2} }),
	json.dumps({"q_2":{"d_2":1, "d_3":2, "d_5":3}})]
	>>> predictions = [json.dumps({"q_1": { "d_1": 0.8, "d_2": 0.9}}),
	json.dumps({"q_2": {"d_2": 0.9, "d_1": 0.8, "d_5": 0.7, "d_3": 0.3}})]
	>>> results = my_new_module.compute(references=references, predictions=predictions)
	>>> print(results)
	{'recall': 1.0}
	"""

	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class mrr(evaluate.Metric):
	    # Mean reciprocal rank metric. The score itself is computed by ranx; this
	    # class only decodes the JSON-encoded inputs and forwards them.

	    def _info(self):
	        """Return the metadata describing this metric and its input features."""
	        return evaluate.MetricInfo(
	            # This is the description that will appear on the modules page.
	            module_type="metric",
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            # This defines the format of each prediction and reference:
	            # both are JSON documents serialized as strings.
	            features=datasets.Features({
	                'predictions': datasets.Value("string"),
	                'references': datasets.Value("string")
	            }),
	            # Homepage of the module for documentation
	            reference_urls=["https://amenra.github.io/ranx/"]
	        )

	    @staticmethod
	    def _merge_json_dicts(encoded_items):
	        """Decode every JSON string and merge the decoded dicts into one dict.

	        Args:
	            encoded_items: iterable of JSON strings, each expected to decode
	                to a dictionary.

	        Returns:
	            A single dictionary holding all top-level keys. When the same key
	            occurs more than once, the later entry replaces the earlier one.
	        """
	        merged = {}
	        for item in encoded_items:
	            # In-place update avoids building a new dict on every iteration
	            # and does not require the dict union operator.
	            merged.update(json.loads(item))
	        return merged

	    def _compute(self, predictions, references, k=None):
	        """Return the mean reciprocal rank of the predictions.

	        Args:
	            predictions: iterable of JSON strings; the decoded dictionaries
	                are merged and handed to ranx as the run.
	            references: iterable of JSON strings; the decoded dictionaries
	                are merged and handed to ranx as the qrels.
	            k: optional cutoff. When not None the metric requested from ranx
	                is "mrr@{k}" instead of "mrr".

	        Returns:
	            A dictionary with the single key "mrr" holding the value returned
	            by ranx's evaluate function.
	        """
	        run = Run(self._merge_json_dicts(predictions))
	        qrels = Qrels(self._merge_json_dicts(references))
	        metric = "mrr" if k is None else f"mrr@{k}"
	        mrr_score = ran_evaluate(qrels, run, metric)
	        return {
	            "mrr": mrr_score,
	        }