Elron committed on
Commit a350a45 · verified · 1 Parent(s): ee0deab

Upload folder using huggingface_hub

Files changed (6)
  1. dataset.py +7 -3
  2. metric.py +6 -1
  3. metrics.py +72 -47
  4. standard.py +3 -1
  5. task.py +11 -1
  6. version.py +1 -1
dataset.py CHANGED
@@ -10,6 +10,7 @@ from .catalog import __file__ as _
 from .collections import __file__ as _
 from .collections_operators import __file__ as _
 from .dataclass import __file__ as _
+from .dataset_utils import __file__ as _
 from .dataset_utils import get_dataset_artifact
 from .deprecation_utils import __file__ as _
 from .dialog_operators import __file__ as _
@@ -19,11 +20,13 @@ from .file_utils import __file__ as _
 from .formats import __file__ as _
 from .fusion import __file__ as _
 from .generator_utils import __file__ as _
+from .hf_utils import __file__ as _
 from .hf_utils import verify_versions_compatibility
 from .inference import __file__ as _
 from .instructions import __file__ as _
 from .llm_as_judge import __file__ as _
 from .loaders import __file__ as _
+from .logging_utils import __file__ as _
 from .logging_utils import get_logger
 from .metric import __file__ as _
 from .metric_utils import __file__ as _
@@ -37,6 +40,7 @@ from .random_utils import __file__ as _
 from .recipe import __file__ as _
 from .register import __file__ as _
 from .schema import __file__ as _
+from .settings_utils import __file__ as _
 from .settings_utils import get_constants
 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
@@ -50,6 +54,7 @@ from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
+from .utils import __file__ as _
 from .utils import is_package_installed
 from .validate import __file__ as _
 from .version import __file__ as _
@@ -70,9 +75,8 @@ class Dataset(datasets.GeneratorBasedBuilder):
         if is_package_installed("unitxt"):
             verify_versions_compatibility("dataset", self.VERSION)
 
-            from unitxt.dataset_utils import (
-                get_dataset_artifact as get_dataset_artifact_installed,
-            )
+            from unitxt.dataset_utils import \
+                get_dataset_artifact as get_dataset_artifact_installed
 
             logger.info("Loading with installed unitxt library...")
             dataset = get_dataset_artifact_installed(self.config.name)
metric.py CHANGED
@@ -19,13 +19,16 @@ from .file_utils import __file__ as _
 from .formats import __file__ as _
 from .fusion import __file__ as _
 from .generator_utils import __file__ as _
+from .hf_utils import __file__ as _
 from .hf_utils import verify_versions_compatibility
 from .inference import __file__ as _
 from .instructions import __file__ as _
 from .llm_as_judge import __file__ as _
 from .loaders import __file__ as _
 from .logging_utils import __file__ as _
-from .metric_utils import UNITXT_METRIC_SCHEMA, _compute
+from .metric_utils import UNITXT_METRIC_SCHEMA
+from .metric_utils import __file__ as _
+from .metric_utils import _compute
 from .metrics import __file__ as _
 from .normalizers import __file__ as _
 from .operator import __file__ as _
@@ -36,6 +39,7 @@ from .random_utils import __file__ as _
 from .recipe import __file__ as _
 from .register import __file__ as _
 from .schema import __file__ as _
+from .settings_utils import __file__ as _
 from .settings_utils import get_constants
 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
@@ -49,6 +53,7 @@ from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
+from .utils import __file__ as _
 from .utils import is_package_installed
 from .validate import __file__ as _
 from .version import __file__ as _
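
Note on the dataset.py and metric.py changes above: each newly added `from .X import __file__ as _` line sits next to the import of the symbol actually used from the same module. A plausible reading, not stated in the commit, is that importing a sibling module's __file__ makes the Hugging Face script packager treat that file as a local dependency to upload alongside the loader script. The pattern, as it appears in these scripts:

# Pattern used in dataset.py and metric.py (module-level code inside the package).
# The first import only binds the module's file path to a throwaway name,
# presumably so the file itself is tracked as a dependency of the script;
# the second import brings in the symbol that is actually called.
from .hf_utils import __file__ as _  # noqa: F401
from .hf_utils import verify_versions_compatibility
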
metrics.py CHANGED
@@ -29,7 +29,7 @@ from .operators import CopyFields
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
-from .type_utils import isoftype, parse_type_string, to_float_or_default
+from .type_utils import isoftype, parse_type_string
 
 logger = get_logger()
 settings = get_settings()
@@ -1261,17 +1261,28 @@ class F1Micro(F1):
     average = "micro"
 
 
-class F1Binary(F1):
+class F1Binary(GlobalMetric):
     """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""
 
     process_single_instances = False
     main_score = "f1_binary"
-    average = "binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
+    average = None
     threshold = 0.5
+    prediction_type = "Union[float, int]"
+    _metric = None
+    metric = "f1"
+    single_reference_per_prediction = True
 
-    def get_str_id(self, str):
-        return int(str)
+    def prepare(self):
+        super().prepare()
+        self._metric = evaluate.load(self.metric)
+
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
 
     def compute(
         self,
@@ -1279,12 +1290,21 @@ class F1Binary(F1):
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        predictions_floats = [to_float_or_default(p) for p in predictions]
-        predictions = [str(int(p > self.threshold)) for p in predictions_floats]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
-        return super().compute(references, predictions, task_data)
+        flattened_int_references = [int(r[0]) for r in references]
+        int_predictions = [int(p > self.threshold) for p in predictions]
+
+        result = self._metric.compute(
+            references=flattened_int_references,
+            predictions=int_predictions,
+            labels=[0, 1],
+            average=self.average,
+        )
+        if isinstance(result[self.metric], numpy.ndarray):
+            return {
+                self.main_score: result[self.metric][1],
+                f"{self.main_score}_neg": result[self.metric][0],
+            }
+        return {self.main_score: result[self.metric]}
 
 
 class RecallBinary(F1Binary):
@@ -1538,7 +1558,7 @@ class KendallTauMetric(GlobalMetric):
     main_score = "kendalltau_b"
     variant = "b"
     process_single_instances = False
-    prediction_type = "str"
+    prediction_type = "float"
 
     _requirements_list: List[str] = ["scipy"]
 
@@ -1555,8 +1575,6 @@ class KendallTauMetric(GlobalMetric):
     ) -> dict:
         if isinstance(references[0], list):
            references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
        kendall_results = self.kendalltau(references, predictions, variant=self.variant)
        corr = kendall_results.correlation
@@ -1602,7 +1620,7 @@ class RocAuc(GlobalMetric):
     process_single_instances = False
     _requirements_list: List[str] = ["sklearn"]
     single_reference_per_prediction = True
-    prediction_type = "str"
+    prediction_type = "float"
 
     def prepare(self):
         from sklearn import metrics
@@ -1618,8 +1636,6 @@ class RocAuc(GlobalMetric):
     ) -> dict:
         if isinstance(references[0], list):
            references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
        false_positive_rates, true_positive_rates, _ = self.roc_curve(
            y_true=references, y_score=predictions
@@ -3337,33 +3353,42 @@ class BinaryMaxF1(F1Binary):
     """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""
 
     main_score = "max_f1_binary"
-    prediction_type = str
     single_reference_per_prediction = True
 
     def compute(
         self,
-        references: List[List[str]],
-        predictions: List[List[str]],
+        references: List[List[float]],
+        predictions: List[List[float]],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-
         best_thr = -1
         best_f1 = -1
-        thrs = {round(fp, 3) for fp in float_predictions}
+        best_thr_neg = -1
+        best_f1_neg = -1
+        thrs = {round(fp, 3) for fp in predictions}
         for thr in thrs:
             new_predictions = [
-                "1" if float_prediction >= thr else "0"
-                for float_prediction in float_predictions
-            ]
-            f1 = super().compute(references, new_predictions, task_data)[
-                self.main_score
+                1.0 if float_prediction >= thr else 0.0
+                for float_prediction in predictions
             ]
+            f1_results = super().compute(references, new_predictions, task_data)
+
+            f1 = f1_results[self.main_score]
             if f1 > best_f1:
                 best_f1 = f1
                 best_thr = thr
 
-        return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
+            f1_neg = f1_results[f"{self.main_score}_neg"]
+            if f1_neg > best_f1_neg:
+                best_f1_neg = f1_neg
+                best_thr_neg = thr
+
+        return {
+            self.main_score: best_f1,
+            "best_thr_maxf1": best_thr,
+            f"{self.main_score}_neg": best_f1_neg,
+            "best_thr_maxf1_neg": best_thr_neg,
+        }
 
 
 class BinaryAccuracy(InstanceMetric):
@@ -3372,20 +3397,25 @@ class BinaryAccuracy(InstanceMetric):
     reduction_map = {"mean": ["accuracy_binary"]}
     main_score = "accuracy_binary"
     ci_scores = ["accuracy_binary"]
-    pos_classes = {"1", "1.0", "yes", "true"}
     threshold = 0.5
 
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
     single_reference_per_prediction = True
 
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
+
     def compute(
-        self, references: List[Any], prediction: Any, task_data: List[Dict]
+        self, references: List[float], prediction: float, task_data: List[Dict]
    ) -> dict:
-        float_prediction = to_float_or_default(prediction)
-        prediction = str(int(float_prediction > self.threshold))
-        references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
+        prediction = int(prediction > self.threshold)
+        reference = int(references[0])
 
-        result = {self.main_score: float([prediction] == references)}
+        result = {self.main_score: float(prediction == reference)}
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
@@ -3396,9 +3426,7 @@ class BinaryMaxAccuracy(GlobalMetric):
 
     process_single_instances = False
     main_score = "max_accuracy_binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
-
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
     single_reference_per_prediction = True
 
     def compute(
@@ -3407,10 +3435,7 @@ class BinaryMaxAccuracy(GlobalMetric):
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
+        references = [[int(r[0])] for r in references]
 
         # Sticking to the test >= thr, accuracy induced by threshold thr is the number of float predictions
         # that pass the test (are >= thr) and are paired with reference "1" plus the number of float predictions that
@@ -3421,8 +3446,8 @@ class BinaryMaxAccuracy(GlobalMetric):
         # the largest float predictions, to induce the partition into all-failing , none-passing.
 
         fp = [
-            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
-            for i in range(len(float_predictions))
+            (predictions[i], i, -1 if references[i][0] == 1 else +1)
+            for i in range(len(predictions))
         ]
         fp.sort()
         # each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
@@ -3436,7 +3461,7 @@ class BinaryMaxAccuracy(GlobalMetric):
 
         current_thr = fp[0][0]
         # partition float_predictions into all-passing, none-failing
-        current_acc = sum(r[0] == "1" for r in references)
+        current_acc = sum(r[0] == 1 for r in references)
         # number of predictions that thr sends to the reference they are paired with
 
         best_acc = current_acc
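
Note on the metrics.py changes above: the binary metrics now consume float/int predictions directly instead of strings. F1Binary thresholds the scores at 0.5 and delegates to the Hugging Face evaluate "f1" metric with average=None, reporting the positive-class score as f1_binary and the class-0 score as f1_binary_neg; BinaryMaxF1 sweeps the observed (rounded) prediction values as candidate thresholds. The snippet below is a standalone sketch of that logic with made-up scores, not the unitxt classes themselves (it assumes the evaluate and scikit-learn packages are available):

# Standalone sketch of the reworked binary F1 logic shown above -- illustrative
# values and variable names, not the unitxt classes themselves.
import evaluate

f1_metric = evaluate.load("f1")

references = [1, 0, 1, 1, 0]              # gold labels, already 0/1 as the new code asserts
predictions = [0.9, 0.4, 0.35, 0.8, 0.1]  # raw float scores from a model

# F1Binary: threshold at 0.5, then per-class F1 via average=None.
int_predictions = [int(p > 0.5) for p in predictions]
result = f1_metric.compute(
    references=references,
    predictions=int_predictions,
    labels=[0, 1],
    average=None,
)
f1_binary, f1_binary_neg = result["f1"][1], result["f1"][0]

# BinaryMaxF1: sweep the (rounded) observed scores as candidate thresholds and
# keep the best positive-class F1 (the real class also tracks a _neg variant).
best_f1, best_thr = -1.0, -1.0
for thr in {round(p, 3) for p in predictions}:
    thresholded = [int(p >= thr) for p in predictions]
    scores = f1_metric.compute(
        references=references, predictions=thresholded, labels=[0, 1], average=None
    )["f1"]
    if scores[1] > best_f1:
        best_f1, best_thr = scores[1], thr

print(f1_binary, f1_binary_neg, best_f1, best_thr)
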
standard.py CHANGED
@@ -225,7 +225,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
             self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
             self.processing.steps.append(self.augmentor)
 
-        if self.num_demos > 0:
+        if self.demos_pool_size is not None:
             self.processing.steps.append(
                 CreateDemosPool(
                     from_split=self.demos_taken_from,
@@ -234,6 +234,8 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
                     remove_targets_from_source_split=self.demos_removed_from_data,
                 )
             )
+
+        if self.num_demos > 0:
             if self.sampler is None:
                 if self.card.sampler is None:
                     raise ValueError(
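
Note on the standard.py change above: creating the demos pool is now tied to demos_pool_size being set, while the sampler requirement applies only when num_demos > 0, so a pool can be prepared even when no demos are added to the prompt. A minimal standalone sketch of that decision logic (the helper name and error message are hypothetical; only the two conditions mirror the diff):

# Minimal standalone sketch of the reworked demo handling in BaseRecipe.
def configure_demos(steps, demos_pool_size, num_demos, sampler, card_sampler):
    if demos_pool_size is not None:
        # the demos pool split is created whenever a pool size is configured,
        # even if no demos end up being sampled into the prompt
        steps.append(("create_demos_pool", demos_pool_size))

    if num_demos > 0:
        # a sampler is only required when demos are actually used
        if sampler is None and card_sampler is None:
            raise ValueError("no sampler configured for in-context demos")

    return steps


# pool prepared, no demos sampled, so no sampler is needed:
print(configure_demos([], demos_pool_size=100, num_demos=0, sampler=None, card_sampler=None))
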
task.py CHANGED
@@ -3,7 +3,13 @@ from typing import Any, Dict, List, Optional, Union
 from .artifact import fetch_artifact
 from .logging_utils import get_logger
 from .operator import StreamInstanceOperator
-from .type_utils import isoftype, parse_type_string, verify_required_schema
+from .type_utils import (
+    get_args,
+    get_origin,
+    isoftype,
+    parse_type_string,
+    verify_required_schema,
+)
 
 
 class Tasker:
@@ -79,6 +85,10 @@ class FormTask(Tasker, StreamInstanceOperator):
                 prediction_type == metric_prediction_type
                 or prediction_type == Any
                 or metric_prediction_type == Any
+                or (
+                    get_origin(metric_prediction_type) is Union
+                    and prediction_type in get_args(metric_prediction_type)
+                )
             ):
                 continue
 
 
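Note on the task.py change above: the prediction-type compatibility check now also passes when the metric declares a Union type that contains the task's prediction type, tested with get_origin/get_args (imported here from unitxt's own .type_utils). A standalone sketch of the widened check using the standard typing module:

# Standalone sketch of the widened compatibility test, using typing directly
# (unitxt imports get_args/get_origin from its .type_utils module).
from typing import Any, Union, get_args, get_origin


def types_compatible(prediction_type, metric_prediction_type) -> bool:
    return (
        prediction_type == metric_prediction_type
        or prediction_type == Any
        or metric_prediction_type == Any
        or (
            get_origin(metric_prediction_type) is Union
            and prediction_type in get_args(metric_prediction_type)
        )
    )


# e.g. a float prediction now matches a metric that accepts Union[float, int]
print(types_compatible(float, Union[float, int]))  # True
print(types_compatible(str, Union[float, int]))    # False
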
version.py CHANGED
@@ -1 +1 @@
-version = "1.7.9"
+version = "1.8.0"