Spaces:

berkatil
/

mrr

Running

File size: 3,880 Bytes

23afbaa
 
 
 
 
 
 
 
 
 
 
 
 
6b816f7
23afbaa
 
 
c01271c
 
 
23afbaa
 
 
c01271c
 
 
 
 
 
 
 
 
 
23afbaa
 
 
 
c01271c
 
23afbaa
 
 
 
 
c01271c
 
 
 
 
23afbaa
c01271c
23afbaa
c01271c
 
 
 
 
 
 
23afbaa
c01271c
23afbaa
 
 
6b816f7
23afbaa
 
 
 
 
 
 
 
 
c01271c
6b816f7
23afbaa
 
c01271c
23afbaa
 
c01271c
23afbaa
c01271c
 
 
 
 
 
 
 
 
 
 
23afbaa
c01271c
23afbaa

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mean reciprocal rank metric"""

import evaluate
import datasets
import json
from ranx import Qrels, Run
from ranx import evaluate as ran_evaluate


_CITATION = """\
@inproceedings{ranx,
  author       = {Elias Bassani},
  title        = {ranx: {A} Blazing-Fast Python Library for Ranking Evaluation and Comparison},
  booktitle    = {{ECIR} {(2)}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13186},
  pages        = {259--264},
  publisher    = {Springer},
  year         = {2022},
  doi          = {10.1007/978-3-030-99739-7\_30}
}
"""

_DESCRIPTION = """\
This is the mean reciprocal rank (mrr) metric for retrieval systems.
It is the multiplicative inverse of the rank of the first retrieved relevant document: 1 for first place, 1/2 for second place, 1/3 for third place, and so on. You can refer to [here](https://amenra.github.io/ranx/metrics/#mean-reciprocal-rank)
"""


_KWARGS_DESCRIPTION = """
Args:
    predictions: dictionary of dictionaries where each dictionary consists of document relevancy scores produced by the model for a given query 
        One dictionary per query.  
    references: List of list of strings where each lists consists of the relevant document names for a given query in a sorted relevancy order.
        The outer list is sorted from query one to n.
    k: `int`, optional, default is None, it is to calculate mrr@k
Returns:
    mrr (`float`): mean reciprocal rank. Minimum possible value is 0. Maximum possible value is 1.0
Examples:
   
    >>> my_new_module = evaluate.load("mrr")
    >>> references= [json.dumps({"q_1":{"d_1":1, "d_2":2} }), 
             json.dumps({"q_2":{"d_2":1, "d_3":2, "d_5":3}})] 
    >>> predictions = [json.dumps({"q_1": { "d_1": 0.8, "d_2": 0.9}}),
         json.dumps({"q_2": {"d_2": 0.9, "d_1": 0.8, "d_5": 0.7, "d_3": 0.3}})]
    >>> results = my_new_module.compute(references=references, predictions=predictions)
    >>> print(results)
    {'recall': 1.0}
"""

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class mrr(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions':  datasets.Value("string"),
                'references':  datasets.Value("string")
            }),
            # Homepage of the module for documentation
            reference_urls=["https://amenra.github.io/ranx/"]
        )

    def _compute(self, predictions, references, k=None):
        """Returns the scores"""
        preds = {}
        refs = {}
        for pred in predictions:
            preds = preds | json.loads(pred)
        for ref in references:
            refs = refs | json.loads(ref)
        
        run = Run(preds)
        qrels = Qrels(refs)
        metric = "mrr" if k is None else f"mrr@{k}"
        mrr_score = ran_evaluate(qrels, run, metric)
        return {
            "mrr": mrr_score,
        }