Upload folder using huggingface_hub
- api.py +9 -2
- metric_utils.py +5 -1
- metrics.py +435 -19
- operators.py +34 -18
- serializers.py +20 -1
- task.py +6 -1
- templates.py +91 -10
- types.py +9 -0
- version.py +1 -1
api.py
CHANGED
@@ -7,6 +7,7 @@ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from .artifact import fetch_artifact
 from .card import TaskCard
 from .dataset_utils import get_dataset_artifact
+from .error_utils import UnitxtError
 from .inference import (
     InferenceEngine,
     LogProbInferenceEngine,
@@ -198,8 +199,14 @@ def load_dataset(
 ).with_transform(loads_instance)
 
 
-def evaluate(
-    predictions, data) -> EvaluationResults:
+def evaluate(
+    predictions, dataset: Union[Dataset, IterableDataset] = None, data=None
+) -> EvaluationResults:
+    if dataset is None and data is None:
+        raise UnitxtError(message="Specify 'dataset' in evaluate")
+    if data is not None:
+        dataset = data  # for backward compatibility
+    return _compute(predictions=predictions, references=dataset)
 
 
 def post_process(predictions, data) -> List[Dict[str, Any]]:
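Note: a minimal usage sketch of the new `evaluate` signature. The card name and predictions below are illustrative placeholders, not part of this commit.

from unitxt.api import evaluate, load_dataset

dataset = load_dataset(card="cards.wnli", split="test")  # hypothetical card
predictions = ["entailment"] * len(dataset)  # stand-in model outputs

results = evaluate(predictions, dataset=dataset)  # new, preferred keyword
results = evaluate(predictions, data=dataset)  # old spelling, routed to `dataset` for backward compatibility
# evaluate(predictions)  # would raise UnitxtError: neither 'dataset' nor 'data' given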
metric_utils.py
CHANGED
@@ -38,7 +38,11 @@ constants = get_constants()
 
 
 def nan_mean(scores):
-    return mean(score for score in scores if score == score)
+    result = mean(score for score in scores if score == score)
+    try:
+        return float(result)
+    except:
+        return result
 
 
 class FromPredictionsAndOriginalData(StreamInitializerOperator):
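Note: the `score == score` filter relies on NaN being the only value unequal to itself, and the `float(...)` cast keeps numpy scalars out of the score dictionaries. A quick sketch of the behavior, assuming `mean` here is `statistics.mean` (which accepts a generator):

from statistics import mean

scores = [0.5, float("nan"), 1.0]
assert mean(s for s in scores if s == s) == 0.75  # the NaN entry is skipped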
metrics.py
CHANGED
@@ -7,10 +7,10 @@ import string
 import uuid
 import warnings
 from abc import ABC, abstractmethod
-from collections import Counter, defaultdict
+from collections import Counter, defaultdict, namedtuple
 from dataclasses import field
 from functools import lru_cache
-from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Union
 
 import numpy
 import numpy as np
@@ -317,6 +317,398 @@ class Metric(Artifact):
         instance["score"]["global"].pop(score_ci)
 
 
+def new_random_generator():
+    # The np.random.default_rng expects a 32-bit int, while hash(..) can return a 64-bit integer.
+    # So use '& MAX_32BIT' to get a 32-bit seed.
+    _max_32bit = 2**32 - 1
+    return np.random.default_rng(hash(get_seed()) & _max_32bit)
+
+
+class ConfidenceIntervalMixin(Artifact):
+    n_resamples: int = 1000
+    confidence_level: float = 0.95
+    ci_score_names: List[str] = None
+
+    @abstractmethod
+    def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
+        pass
+
+    def get_statistic(self, data: List[Any], score_names: List[str]):
+        def statistic_function(indices, axis=0):
+            # indices might be a 1D or 2D array, depending on bootstrap internals
+            # For simplicity, ensure we handle them as 1D.
+            indices = np.atleast_1d(indices).astype(int)
+
+            # Gather the subset
+            sample = [data[i] for i in indices]
+
+            # Compute metrics on this sample
+            scores = self._sample_to_scores(sample)
+
+            # Return them in consistent order
+            return np.array([scores[m] for m in score_names])
+
+        return statistic_function
+
+    def bootstrap(self, data: List[Any], score_names: List[str]):
+        if self.ci_score_names is not None:
+            score_names = self.ci_score_names
+
+        intervals = bootstrap(
+            (np.arange(len(data)),),
+            statistic=self.get_statistic(data, score_names),
+            n_resamples=self.n_resamples,
+            confidence_level=self.confidence_level,
+            random_state=new_random_generator(),
+            paired=False,
+            vectorized=False,  # set to True if your statistic function is vectorized
+            method="BCa",
+        ).confidence_interval
+
+        result = {}
+        for i, metric in enumerate(score_names):
+            result[f"{metric}_ci_low"] = float(intervals.low[i])
+            result[f"{metric}_ci_high"] = float(intervals.high[i])
+
+        return result
+
+
+from typing import Generic, TypeVar, NamedTuple
+from dataclasses import dataclass
+
+IntermediateType = TypeVar("IntermediateType")
+PredictionType = TypeVar("PredictionType")
+
+
+class EvaluationInput(tuple, Generic[PredictionType]):
+    def __new__(
+        cls,
+        prediction: PredictionType,
+        references: List[PredictionType],
+        task_data: Dict[str, Any],
+    ) -> "EvaluationInput[PredictionType]":
+        return super().__new__(cls, (prediction, references, task_data))
+
+
+def is_original_key(key):
+    if (
+        key.endswith("_ci_low")
+        or key.endswith("_ci_high")
+        or key == "score"
+        or key == "num_of_instances"
+        or key == "score_name"
+    ):
+        return False
+    return True
+
+
+class MapReduceMetric(
+    StreamOperator,
+    Metric,
+    ConfidenceIntervalMixin,
+    Generic[PredictionType, IntermediateType],
+):
+    score_prefix = ""
+    reference_field: str = NonPositionalField(default="references")
+    prediction_field: str = NonPositionalField(default="prediction")
+
+    def map(
+        self,
+        prediction: PredictionType,
+        references: List[PredictionType],
+        task_data: Dict[str, Any],
+    ) -> IntermediateType:
+        raise NotImplementedError()
+
+    def reduce_one(self, intermidate: IntermediateType):
+        return self.reduce([intermidate])
+
+    @abstractmethod
+    def reduce(self, intermediates: List[IntermediateType]) -> Dict[str, Any]:
+        return {}
+
+    def disable_confidence_interval_calculation(self):
+        self.n_resamples = None
+
+    def annotate_scores(self, scores):
+        scores = {
+            **{self.score_prefix + key: val for key, val in scores.items()},
+            "score_name": self.score_prefix + self.main_score,
+            "score": scores[self.main_score],
+        }
+        for level in ["high", "low"]:
+            if f"{self.main_score}_ci_{level}" in scores:
+                scores[f"score_ci_{level}"] = scores[f"{self.main_score}_ci_{level}"]
+        return scores
+
+    def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
+        return self.reduce(sample)
+
+    def reduce_and_bootstrap(
+        self, intermediates: List[IntermediateType]
+    ) -> Dict[str, Any]:
+        scores = self.reduce(intermediates)
+        score_names = [k for k, v in scores.items() if isinstance(v, float)]
+        if self.n_resamples is None:
+            return scores
+        intervals = self.bootstrap(intermediates, score_names)
+        return {**scores, **intervals}
+
+    def _instance_to_evaluation_input(
+        self, instance: Dict[str, Any]
+    ) -> EvaluationInput[PredictionType]:
+        instance = self.verify_instance(instance)
+
+        task_data = instance.get("task_data", {})
+
+        if self.reference_field == "references":
+            references = instance["references"]
+        else:
+            references = task_data[self.reference_field]
+            if not isinstance(references, list):
+                references = [references]
+        if self.prediction_field == "prediction":
+            prediction = instance["prediction"]
+        else:
+            prediction = task_data[self.prediction_field]
+
+        self._validate_prediction(prediction)
+        self._validate_reference(references)
+
+        return EvaluationInput[PredictionType](
+            prediction=prediction, references=references, task_data=task_data
+        )
+
+    def _instances_stream_to_evaluation_inputs(
+        self, stream: Stream
+    ) -> Generator[EvaluationInput[PredictionType], None, None]:
+        for instance in stream:
+            yield self._instance_to_evaluation_input(instance)
+
+    def map_stream(
+        self,
+        evaluation_inputs_stream: Generator[
+            EvaluationInput[PredictionType], None, None
+        ],
+    ):
+        intermediates = []
+        for prediction, references, task_data in evaluation_inputs_stream:
+            intermediate = self.map(
+                prediction=prediction, references=references, task_data=task_data
+            )
+
+            intermediates.append(intermediate)
+        return intermediates
+
+    def process(self, stream: Stream, stream_name: Optional[str] = None):
+        instances_scores, global_scores = self.compute(stream, stream_name)
+        for i, (instance, instance_scores) in enumerate(zip(stream, instances_scores)):
+            previous_score = instance.get("score", {"global": {}, "instance": {}})
+
+            if i == 0:
+                for key in global_scores:
+                    if is_original_key(key) and key in previous_score["global"]:
+                        UnitxtWarning(
+                            message=f"Metric '{key}' that has just been evaluated with value {global_scores[key]}, is already recorded "
+                            f"to have value {previous_score['global'][key]} by a previous metric evaluation on this instance or stream. "
+                            f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
+                            f"which will yield, in this case, a score named: 'my_second_{key}')",
+                            additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
+                        )
+
+            global_scores = {**previous_score["global"], **global_scores}
+            instance_scores = {**previous_score["instance"], **instance_scores}
+
+            yield {
+                **instance,
+                "score": {"global": global_scores, "instance": instance_scores},
+            }
+
+    def compute(self, stream: Stream, stream_name: Optional[str] = None):
+        evaluation_inputs_stream = self._instances_stream_to_evaluation_inputs(stream)
+        intermediates_list = self.map_stream(evaluation_inputs_stream)
+
+        instances_scores = []
+        for intermediate in intermediates_list:
+            instance_score = self.reduce_one(intermediate)
+            instance_score = self.annotate_scores(instance_score)
+            instances_scores.append(instance_score)
+
+        global_scores = self.reduce_and_bootstrap(intermediates_list)
+        global_scores = self.annotate_scores(global_scores)
+
+        global_scores["num_of_instances"] = len(intermediates_list)
+
+        return instances_scores, global_scores
+
+
+def get_index_or_default(lst, item, default=-1):
+    try:
+        return lst.index(item)
+    except ValueError:
+        return default
+
+
+class AggregationReduction(Artifact, Generic[IntermediateType]):
+    def reduce(self, intermidates: List[IntermediateType]) -> Dict[str, Any]:
+        pass
+
+
+class DictReduction(AggregationReduction[Dict[str, float]]):
+    def reduce_list(self, lst: List[float]):
+        pass
+
+    def reduce(self, intermidates: List[Dict[str, float]]):
+        lists = {}
+        for intermidate in intermidates:
+            for key, val in intermidate.items():
+                if key not in lists:
+                    lists[key] = []
+                lists[key].append(val)
+
+        result = {}
+        for key, val_list in lists.items():
+            result[key] = self.reduce_list(val_list)
+        return result
+
+
+class MeanReduction(DictReduction):
+    def reduce_list(self, lst: List[float]):
+        return nan_mean(lst)
+
+
+class MaxReduction(DictReduction):
+    def reduce_list(self, lst: List[float]):
+        return float(nan_max(lst))
+
+
+class ReductionInstanceMetric(
+    MapReduceMetric[PredictionType, IntermediateType],
+    Generic[PredictionType, IntermediateType],
+):
+    reduction: AggregationReduction[IntermediateType]
+
+    def reduce(self, intermediates: List[IntermediateType]) -> Dict[str, Any]:
+        return self.reduction.reduce(intermediates)
+
+    def reduce_one(self, intermidate: IntermediateType):
+        return recursive_copy(intermidate)
+
+
+class AccuracyFast(ReductionInstanceMetric[str, Dict[str, float]]):
+    main_score = "accuracy"
+    reduction = MeanReduction()
+
+    def map(
+        self, prediction: str, references: List[str], task_data: Dict[str, Any]
+    ) -> Dict[str, float]:
+        return {
+            self.main_score: float(
+                str(prediction) in [str(reference) for reference in references]
+            )
+        }
+
+
+class F1Fast(MapReduceMetric[str, Tuple[int, int]]):
+    main_score = "f1"
+    averages: List[Literal["f1", "macro", "micro", "per_class"]] = [
+        "f1",
+        "micro",
+        "macro",
+        "per_class",
+    ]
+    ignore_punc: bool = True
+    ignore_case: bool = True
+    _requirements_list = ["scikit-learn", "regex"]
+
+    def prepare(self):
+        super().prepare()
+        from sklearn.metrics import f1_score
+
+        self._metric = f1_score
+        import regex
+        from functools import partial
+
+        self.remove_punc = partial(regex.compile(r"\p{P}+").sub, "")
+
+    def get_str_id(self, str):
+        if str not in self.str_to_id:
+            id = len(self.str_to_id)
+            self.str_to_id[str] = id
+            self.id_to_str[id] = str
+        return self.str_to_id[str]
+
+    def map_stream(
+        self, evaluation_inputs_stream: Generator[EvaluationInput[str], None, None]
+    ):
+        self.str_to_id = {}
+        self.id_to_str = {}
+        return super().map_stream(evaluation_inputs_stream)
+
+    def map(
+        self, prediction: str, references: List[str], task_data: Dict[str, Any]
+    ) -> Tuple[int, int]:
+        reference_index = self.get_str_id(references[0])
+        prediction_index = self.get_str_id(prediction)
+
+        return prediction_index, reference_index
+
+    def reduce(self, intermediates: List[Tuple[int, int]]) -> Dict[str, Any]:
+        y_true = []
+        y_pred = []
+        labels = set()
+        for pred_idx, ref_idx in intermediates:
+            y_pred.append(pred_idx)
+            y_true.append(ref_idx)
+            labels.add(ref_idx)
+
+        labels = list(labels)
+        result = {}
+
+        if "f1" in self.averages:
+            result["f1"] = float(
+                self._metric(
+                    y_true,
+                    y_pred,
+                    average="macro",
+                    labels=labels,
+                    zero_division=0,
+                )
+            )
+
+        if "micro" in self.averages:
+            result["f1_micro"] = float(
+                self._metric(
+                    y_true,
+                    y_pred,
+                    average="micro",
+                    labels=labels,
+                    zero_division=0,
+                )
+            )
+
+        if "macro" in self.averages:
+            result["f1_macro"] = float(
+                self._metric(
+                    y_true,
+                    y_pred,
+                    average="macro",
+                    labels=labels,
+                    zero_division=0,
+                )
+            )
+
+        if "per_class" in self.averages:
+            f1_per_class = self._metric(
+                y_true, y_pred, average=None, labels=list(labels), zero_division=0
+            )
+            for label, score in zip(labels, f1_per_class):
+                class_name = self.id_to_str[label]
+                result[f"f1_{class_name}"] = float(score)
+
+        return result
+
+
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
     # Use None to disable confidence interval computation.
@@ -539,10 +931,10 @@ class MetricWithConfidenceInterval(Metric):
                 confidence_level=self.confidence_level,
                 random_state=random_gen,
             ).confidence_interval
-            result["score_ci_low"] = ci.low
-            result["score_ci_high"] = ci.high
-            result[f"{score_name}_ci_low"] = ci.low
-            result[f"{score_name}_ci_high"] = ci.high
+            result["score_ci_low"] = float(ci.low)
+            result["score_ci_high"] = float(ci.high)
+            result[f"{score_name}_ci_low"] = float(ci.low)
+            result[f"{score_name}_ci_high"] = float(ci.high)
             return result
 
 
@@ -1732,7 +2124,7 @@ class HuggingfaceMetric(GlobalMetric):
             **self.hf_compute_args,
         )
         if self.hf_main_score:
-            result[self.main_score] = result[self.hf_main_score]
+            result[self.main_score] = float(result[self.hf_main_score])
             del result[self.hf_main_score]
         if self.scale != 1.0:
             assert (
@@ -1752,6 +2144,8 @@ class HuggingfaceMetric(GlobalMetric):
                     result[key], float
                 ), "Scaled field '{key}' is not float: {result[key]}"
                 result[key] /= self.scale
+        if self.main_score in result:
+            result[self.main_score] = float(result[self.main_score])
         return result
 
 
@@ -1837,17 +2231,49 @@ class HuggingfaceInstanceMetric(InstanceMetric):
         return score
 
 
+class MeteorFast(ReductionInstanceMetric[str, Dict[str, float]]):
+    main_score = "meteor"
+    reduction = MeanReduction()
+    _requirements_list: List[str] = ["nltk>=3.6.6"]
+    alpha: float = 0.9
+    beta: int = 3
+    gamma: float = 0.5
+
+    def prepare(self):
+        super().prepare()
+        import nltk
+
+        nltk.download("wordnet", quiet=True)
+        nltk.download("omw-1.4", quiet=True)
+        from nltk import word_tokenize
+        from nltk.translate import meteor_score
+
+        self.word_tokenize = word_tokenize
+        self.meteor_score = meteor_score
+
+    def map(
+        self, prediction: str, references: List[str], task_data: Dict[str, Any]
+    ) -> Dict[str, float]:
+        score = self.meteor_score.meteor_score(
+            [self.word_tokenize(ref) for ref in references],
+            self.word_tokenize(prediction),
+            alpha=self.alpha,
+            beta=self.beta,
+            gamma=self.gamma,
+        )
+        return {self.main_score: score}
+
+
 class Meteor(InstanceMetric):
     main_score = "meteor"
     ci_scores = ["meteor"]
     reduction_map = {"mean": ["meteor"]}
     prediction_type = str
 
-    _requirements_list: List[str] = ["nltk"]
+    _requirements_list: List[str] = ["nltk>=3.6.6"]
     alpha: float = 0.9
     beta: int = 3
     gamma: float = 0.5
-    # unitxt uses nltk version >= 3.8
 
     def prepare(self):
         super().prepare()
@@ -1861,16 +2287,6 @@ class Meteor(InstanceMetric):
         self.word_tokenize = word_tokenize
         self.meteor_score = meteor_score
 
-    def verify(self):
-        import importlib.metadata as importlib_metadata
-
-        from datasets.config import version
-
-        nltk_version = version.parse(importlib_metadata.version("nltk"))
-        assert nltk_version >= version.Version(
-            "3.6.6"
-        ), "nltk version must be at least 3.6.6"
-
     def compute(self, references, prediction, task_data):
        score = self.meteor_score.meteor_score(
             [self.word_tokenize(ref) for ref in references],
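Note: the new `MapReduceMetric` family splits evaluation into a per-instance `map` that produces an intermediate value and a `reduce` that aggregates intermediates into scores; confidence intervals come from bootstrapping the list of intermediates. A minimal sketch of a custom metric in this style, modeled on `AccuracyFast` above (the class and score name are invented for illustration):

from typing import Any, Dict, List

class LengthMatch(ReductionInstanceMetric[str, Dict[str, float]]):
    main_score = "length_match"
    reduction = MeanReduction()

    def map(
        self, prediction: str, references: List[str], task_data: Dict[str, Any]
    ) -> Dict[str, float]:
        # One intermediate dict per instance; MeanReduction averages each key
        # across instances to form the global score.
        return {
            self.main_score: float(
                any(len(prediction) == len(ref) for ref in references)
            )
        }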
operators.py
CHANGED
@@ -55,6 +55,7 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -1633,6 +1634,12 @@ class ApplyStreamOperatorsField(StreamOperator, ArtifactFetcherMixin):
     yield from stream
 
 
+def update_scores_of_stream_instances(stream: Stream, scores: List[dict]) -> Generator:
+    for instance, score in zip(stream, scores):
+        instance["score"] = recursive_copy(score)
+        yield instance
+
+
 class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
     """Applies metric operators to a stream based on a metric field specified in each instance.
 
@@ -1647,13 +1654,6 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         from .metrics import Metric, MetricsList
 
-        def update_scores_of_stream_instances(
-            stream: Stream, scores: List[dict]
-        ) -> Generator:
-            for instance, score in zip(stream, scores):
-                instance["score"] = recursive_copy(score)
-                yield instance
-
         # to be populated only when two or more metrics
         accumulated_scores = []
 
@@ -1680,29 +1680,28 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
                 f"Operator {metric_name} must be a Metric or MetricsList"
             )
 
+        for metric in metrics_list:
+            if not self.calc_confidence_intervals:
+                metric.disable_confidence_interval_calculation()
         # Each metric operator computes its score and then sets the main score, overwriting
         # the previous main score value (if any). So, we need to reverse the order of the listed metrics.
         # This will cause the first listed metric to run last, and the main score will be set
         # by the first listed metric (as desired).
         metrics_list = list(reversed(metrics_list))
 
-        for metric_no, metric in enumerate(metrics_list):
-            if not self.calc_confidence_intervals:
-                metric.disable_confidence_interval_calculation()
-
-            if metric_no > 0:
-                # update input stream with accumulated scores
+        for i, metric in enumerate(metrics_list):
+            if i == 0:  # first metric
+                multi_stream = MultiStream({"tmp": stream})
+            else:  # metrics with previous scores
                 reusable_generator = ReusableGenerator(
                     generator=update_scores_of_stream_instances,
                     gen_kwargs={"stream": stream, "scores": accumulated_scores},
                 )
                 multi_stream = MultiStream.from_generators({"tmp": reusable_generator})
-            else:
-                multi_stream = MultiStream.from_iterables({"tmp": stream})
+
             multi_stream = metric(multi_stream)
-
-            if metric_no < len(metrics_list) - 1:
-                # updating accumulated_scores
+
+            if i < len(metrics_list) - 1:  # last metric
                 accumulated_scores = []
                 for inst in multi_stream["tmp"]:
                     accumulated_scores.append(recursive_copy(inst["score"]))
@@ -2214,3 +2213,20 @@ class CollateInstances(StreamOperator):
             f"batch_size must be an integer equal to or greater than 1. "
             f"Got: {self.batch_size}."
         )
+
+
+class WikipediaFetcher(FieldOperator):
+    mode: Literal["summary", "text"] = "text"
+    _requirements_list = ["Wikipedia-API"]
+
+    def prepare(self):
+        super().prepare()
+        import wikipediaapi
+
+        self.wikipedia = wikipediaapi.Wikipedia("Unitxt")
+
+    def process_value(self, value: Any) -> Any:
+        title = value.split("/")[-1]
+        page = self.wikipedia.page(title)
+
+        return {"title": page.title, "body": getattr(page, self.mode)}
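Note: `WikipediaFetcher` turns a Wikipedia URL (or a bare page title, since only the last path segment is used) into a `{title, body}` dict that matches the new `Document` type. A rough usage sketch; assumes the Wikipedia-API package and network access, and calling `prepare()` by hand is only for illustration:

fetcher = WikipediaFetcher(mode="summary")
fetcher.prepare()
doc = fetcher.process_value("https://en.wikipedia.org/wiki/Alan_Turing")
# doc -> {"title": "Alan Turing", "body": "<page summary text>"}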
serializers.py
CHANGED
@@ -7,7 +7,7 @@ from .dataclass import AbstractField, Field
 from .operators import InstanceFieldOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype, to_type_string
-from .types import Dialog, Image, Number, Table, Video
+from .types import Dialog, Document, Image, MultiDocument, Number, Table, Video
 
 constants = get_constants()
 
@@ -127,9 +127,28 @@ class VideoSerializer(ImageSerializer):
         return "".join(serialized_images)
 
 
+class DocumentSerializer(SingleTypeSerializer):
+    serialized_type = Document
+
+    def serialize(self, value: Document, instance: Dict[str, Any]) -> str:
+        return f"# {value['title']}\n\n{value['body']}"
+
+
+class MultiDocumentSerializer(DocumentSerializer):
+    serialized_type = MultiDocument
+
+    def serialize(self, value: MultiDocument, instance: Dict[str, Any]) -> str:
+        documents = []
+        for document in value:
+            documents.append(super().serialize(document, instance))
+        return "\n\n".join(documents)
+
+
 class MultiTypeSerializer(Serializer):
     serializers: List[SingleTypeSerializer] = Field(
         default_factory=lambda: [
+            DocumentSerializer(),
+            MultiDocumentSerializer(),
             ImageSerializer(),
             VideoSerializer(),
             TableSerializer(),
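Note: `DocumentSerializer` renders a document as a Markdown-style section, and `MultiDocumentSerializer` joins several such sections; listing both ahead of the other serializers lets `MultiTypeSerializer` dispatch on the new types first. For example (illustrative values):

doc = {"title": "Alan Turing", "body": "British mathematician."}
DocumentSerializer().serialize(doc, {})
# -> "# Alan Turing\n\nBritish mathematician."
MultiDocumentSerializer().serialize([doc, doc], {})
# -> the same section twice, separated by a blank line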
task.py
CHANGED
@@ -116,13 +116,18 @@ class Task(InstanceOperator, ArtifactFetcherMixin):
             self.prediction_type
         )
 
+        if hasattr(self, "inputs") and self.inputs is not None:
+            self.inputs = self.input_fields
+
+        if hasattr(self, "outputs") and self.outputs is not None:
+            self.outputs = self.reference_fields
+
     def task_deprecations(self):
         if hasattr(self, "inputs") and self.inputs is not None:
             depr_message = (
                 "The 'inputs' field is deprecated. Please use 'input_fields' instead."
             )
             warnings.warn(depr_message, DeprecationWarning, stacklevel=2)
-
         if hasattr(self, "outputs") and self.outputs is not None:
             depr_message = "The 'outputs' field is deprecated. Please use 'reference_fields' instead."
             warnings.warn(depr_message, DeprecationWarning, stacklevel=2)
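Note: after this change the deprecated `inputs`/`outputs` fields are re-pointed at `input_fields`/`reference_fields` during verification, so both spellings refer to the same underlying data. A sketch (the task definition is illustrative; assumes unitxt artifacts run `verify()` on construction):

task = Task(
    inputs={"question": str},  # deprecated spelling, emits a DeprecationWarning
    outputs={"answer": str},   # deprecated spelling
    prediction_type=str,
    metrics=["metrics.accuracy"],
)
assert task.inputs is task.input_fields
assert task.outputs is task.reference_fields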
templates.py
CHANGED
@@ -495,7 +495,31 @@ class PairwiseComparativeRatingTemplate(InputOutputTemplate):
 
 
 class MultipleChoiceTemplate(InputFormatTemplate):
-    """Formats the input..."""
+    """Formats the input that specifies a multiple-choice question, with a list of possible answers to choose from, and identifies the correct answer.
+
+    Args:
+        target_prefix (str): Optional prefix that can be added before the target label in
+            generated prompts or outputs.
+        choices_field (str): The key under which the multiple choices are stored in the
+            input and reference dictionaries.
+        target_field (str): The key under which the correct choice is stored in the
+            reference dictionary (can be integer index or textual label).
+        choices_separator (str): A string used to join formatted choices (e.g. ", ").
+        source_choice_format (str): A Python format string used for displaying each choice
+            in the input fields (e.g. "{choice_numeral}. {choice_text}").
+        target_choice_format (str): A Python format string used for displaying each choice
+            in the target or final output (e.g. "{choice_numeral}").
+        enumerator (str): Determines how choice numerals are enumerated. Possible values
+            include "capitals", "lowercase", "numbers", or "roman".
+        shuffle_choices (bool): If True, shuffle the choices. The shuffling seed can be
+            set with `shuffle_choices_seed`.
+        shuffle_choices_seed (int, optional): If provided, the choices are shuffled with
+            this fixed integer seed for reproducibility.
+        sort_choices_by_length (bool): If True, sorts choices by their length (ascending).
+        sort_choices_alphabetically (bool): If True, sorts choices in alphabetical order.
+        reverse_choices (bool): If True, reverses the order of the choices after any
+            sorting has been applied. Defaults to False to preserve backward compatibility.
+    """
 
     target_prefix: str = ""
     choices_field: str = "choices"
@@ -504,7 +528,13 @@ class MultipleChoiceTemplate(InputFormatTemplate):
     source_choice_format: str = "{choice_numeral}. {choice_text}"
     target_choice_format: str = "{choice_numeral}"
     enumerator: str = "capitals"
+
     shuffle_choices: bool = False
+    shuffle_choices_seed: int = None
+    sort_choices_by_length: bool = False
+    sort_choices_alphabetically: bool = False
+    reverse_choices: bool = False  # False by default for backward-compat
+    place_correct_choice_position: int = None
 
     def prepare(self):
         super().prepare()
@@ -538,6 +568,31 @@ class MultipleChoiceTemplate(InputFormatTemplate):
         "XX",
     ]
 
+    def verify(self):
+        super().verify()
+        if self.shuffle_choices and (
+            self.sort_choices_by_length
+            or self.sort_choices_alphabetically
+            or self.reverse_choices
+            or self.place_correct_choice_position is not None
+        ):
+            raise UnitxtError(
+                "You cannot combine shuffle_choices with sorting or reversing flags."
+            )
+
+        if self.sort_choices_by_length and self.sort_choices_alphabetically:
+            raise UnitxtError(
+                "You cannot combine both sort_choices_by_length and sort_choices_alphabetically simultaneously."
+            )
+        if self.place_correct_choice_position is not None and (
+            self.sort_choices_by_length
+            or self.sort_choices_alphabetically
+            or self.reverse_choices
+        ):
+            raise UnitxtError(
+                "You cannot combine place_correct_choice_position with sorting or reversing flags."
+            )
+
     def inputs_to_choices(self, data: Dict[str, Any], choice_format: str) -> str:
         choices = data[self.choices_field]
         enumrated_choices = []
@@ -612,18 +667,44 @@ class MultipleChoiceTemplate(InputFormatTemplate):
     def preprocess_input_and_reference_fields(
         self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any]
     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        if self.shuffle_choices:
-            target_index = self.outputs_to_target_index(reference_fields)
-            original_label_choice = reference_fields[self.choices_field][target_index]
-            choices = input_fields[self.choices_field]
-            random_generator = new_random_generator({**input_fields})
+        if (
+            not self.shuffle_choices
+            and not self.sort_choices_by_length
+            and not self.sort_choices_alphabetically
+            and not self.reverse_choices
+            and self.place_correct_choice_position is None
+        ):
+            return input_fields, reference_fields
+
+        choices = input_fields[self.choices_field]
+        target_index = self.outputs_to_target_index(reference_fields)
+        original_label_choice = reference_fields[self.choices_field][target_index]
+
+        if self.sort_choices_by_length:
+            choices.sort(key=len)
+        if self.sort_choices_alphabetically:
+            choices.sort()
+        if self.reverse_choices:
+            choices.reverse()
+        if self.shuffle_choices:
+            random_generator = new_random_generator(
+                self.shuffle_choices_seed
+                if self.shuffle_choices_seed is not None
+                else {**input_fields}
+            )
             random_generator.shuffle(choices)
-            input_fields[self.choices_field] = choices
-            reference_fields[self.choices_field] = choices
-            reference_fields[self.target_field] = choices.index(original_label_choice)
+        if self.place_correct_choice_position is not None:
+            if not 0 <= self.place_correct_choice_position < len(choices):
+                raise ValueError(
+                    f"fix_correct_choice_position={self.place_correct_choice_position} out of range (0..{len(choices) - 1})."
+                )
+            choices.remove(original_label_choice)
+            choices.insert(self.place_correct_choice_position, original_label_choice)
+
+        # Update both input_fields and reference_fields once at the end
+        input_fields[self.choices_field] = choices
+        reference_fields[self.choices_field] = choices
+        reference_fields[self.target_field] = choices.index(original_label_choice)
 
         return input_fields, reference_fields
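Note: the new ordering flags are mutually exclusive with shuffling, and `place_correct_choice_position` pins the gold answer to a fixed slot after any other reordering. A sketch (field values are illustrative; assumes `verify()` runs when the artifact is built):

template = MultipleChoiceTemplate(
    input_format="{question}\n{choices}",
    target_field="answer",
    place_correct_choice_position=0,  # gold choice is always rendered first
)

MultipleChoiceTemplate(
    input_format="{question}\n{choices}",
    target_field="answer",
    shuffle_choices=True,
    sort_choices_by_length=True,
)  # -> UnitxtError: cannot combine shuffle_choices with sorting flags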
types.py
CHANGED
@@ -26,6 +26,13 @@ class Image(TypedDict):
     format: str
 
 
+class Document(TypedDict):
+    title: str
+    body: str
+
+
+MultiDocument = NewType("MultiDocument", List[Document])
+
 Video = NewType("Video", List[Image])
 
 
@@ -46,4 +53,6 @@ register_type(Table)
 register_type(Audio)
 register_type(Image)
 register_type(Video)
+register_type(Document)
+register_type(MultiDocument)
 register_type(RagResponse)
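Note: `Document` and `MultiDocument` are plain typed containers; registering them lets serializers and type checks dispatch on them. For example:

doc: Document = {"title": "Alan Turing", "body": "British mathematician."}
docs: MultiDocument = [doc, {"title": "Ada Lovelace", "body": "English mathematician."}]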
version.py
CHANGED
@@ -1 +1 @@
-version = "1.16.1"
+version = "1.16.2"