Elron committed
Commit 64dd81e · verified · 1 Parent(s): d346c89

Upload folder using huggingface_hub

Files changed (14):
  1. api.py +2 -1
  2. formats.py +3 -0
  3. fusion.py +3 -0
  4. inference.py +28 -10
  5. llm_as_judge_constants.py +1 -1
  6. loaders.py +40 -17
  7. metrics.py +166 -73
  8. processors.py +7 -0
  9. settings_utils.py +1 -0
  10. sql_utils.py +10 -3
  11. standard.py +1 -1
  12. string_operators.py +2 -0
  13. utils.py +77 -0
  14. version.py +1 -1
api.py CHANGED
@@ -9,6 +9,7 @@ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from datasets.exceptions import DatasetGenerationError

 from .artifact import fetch_artifact
+from .benchmark import Benchmark
 from .card import TaskCard
 from .dataset_utils import get_dataset_artifact
 from .error_utils import UnitxtError
@@ -78,7 +79,7 @@ def _verify_dataset_args(dataset_query: Optional[str] = None, dataset_args=None)


 def load_recipe(dataset_query: Optional[str] = None, **kwargs) -> DatasetRecipe:
-    if isinstance(dataset_query, DatasetRecipe):
+    if isinstance(dataset_query, (DatasetRecipe, Benchmark)):
         return dataset_query

     _verify_dataset_args(dataset_query, kwargs)

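A minimal usage sketch of what this change enables: a prebuilt Benchmark now passes straight through load_recipe, just like a DatasetRecipe. The Benchmark construction below is illustrative; its exact arguments come from unitxt's benchmark module and are assumed here, not shown in this diff.

    from unitxt.api import load_recipe
    from unitxt.benchmark import Benchmark
    from unitxt.standard import DatasetRecipe

    # Hypothetical benchmark; "cards.cola" is an assumed card name.
    benchmark = Benchmark(subsets={"cola": DatasetRecipe(card="cards.cola")})

    # With this commit, load_recipe returns the Benchmark unchanged instead
    # of failing the dataset_query argument verification.
    recipe = load_recipe(benchmark)
    assert recipe is benchmark
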
formats.py CHANGED
@@ -18,6 +18,7 @@ from .image_operators import image_to_data_url
 from .operator import InstanceOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype
+from .utils import retry_connection_with_exponential_backoff

 constants = get_constants()

@@ -33,6 +34,7 @@ class GraniteDocumentsFormat(Format):

     _requirements_list = ["transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer
@@ -487,6 +489,7 @@ class HFSystemFormat(ChatAPIFormat):
     model_name: str
     _requirements_list = ["transformers", "Jinja2"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer

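A short sketch of what the new decorator guards in these prepare() methods: the tokenizer fetch can fail on a flaky hub connection, and the decorator now retries with exponential backoff instead of surfacing the error immediately. The standalone function below mimics the decorated pattern; the function name and model argument are illustrative.

    from unitxt.utils import retry_connection_with_exponential_backoff

    @retry_connection_with_exponential_backoff(backoff_factor=2)
    def fetch_tokenizer(model_name: str):
        from transformers import AutoTokenizer
        # A requests.ConnectionError raised inside from_pretrained (even when
        # wrapped as the __cause__ of another error) is retried with backoff.
        return AutoTokenizer.from_pretrained(model_name)
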
fusion.py CHANGED
@@ -2,11 +2,13 @@ from abc import abstractmethod
 from typing import Dict, Generator, List, Optional, Union

 from .dataclass import NonPositionalField
+from .logging_utils import get_logger
 from .operator import SourceOperator
 from .random_utils import new_random_generator
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype

+logger = get_logger()

 class BaseFusion(SourceOperator):
     """BaseFusion operator that combines multiple multistreams into one.
@@ -76,6 +78,7 @@ class FixedFusion(BaseFusion):
             if split not in multi_stream:
                 continue
             emitted_from_this_split = 0
+            logger.info(f"Processing {split} from {origin_name}...")
             try:
                 for instance in multi_stream[split]:
                     if (

inference.py CHANGED
@@ -31,7 +31,6 @@ from typing import (
 )

 from datasets import Dataset, DatasetDict, Image
-from diskcache import Cache
 from tqdm import tqdm, trange
 from tqdm.asyncio import tqdm_asyncio

@@ -50,6 +49,7 @@ from .operator import PackageRequirementsMixin
 from .operators import ArtifactFetcherMixin
 from .settings_utils import get_constants, get_settings
 from .type_utils import isoftype
+from .utils import retry_connection_with_exponential_backoff

 constants = get_constants()
 settings = get_settings()
@@ -183,7 +183,9 @@ class InferenceEngine(Artifact):
         if not settings.mock_inference_mode:
             super().prepare()  # no need to prepare a mock
             self.prepare_engine()
-        self._cache = Cache(get_settings().inference_engine_cache_path + self.__class__.__name__)
+        if self.use_cache:
+            from diskcache import Cache
+            self._cache = Cache(settings.inference_engine_cache_path + self.__class__.__name__)

     def __call__(
         self,
@@ -199,6 +201,7 @@ class InferenceEngine(Artifact):
     def _get_cache_key(self, instance: Dict[str, Any]) -> str:
         """Generate a unique cache key for each input."""
         record = self.get_instance_cache_key(instance)
+        record["version"] = constants.version
         record.update(self.to_dict())
         instance_str = json.dumps(record, sort_keys=True)
         return hashlib.md5(instance_str.encode()).hexdigest()
@@ -875,6 +878,7 @@ class HFPeftInferenceEngine(HFAutoModelInferenceEngine):
             self.peft_config.base_model_name_or_path
         )

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def _init_model(self):
         from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
         from transformers import AutoConfig
@@ -938,14 +942,26 @@ class HFPipelineBasedInferenceEngine(
         if settings.hf_offline_models_path is not None:
             path = os.path.join(settings.hf_offline_models_path, path)

-        self.task = (
-            "text2text-generation"
-            if AutoConfig.from_pretrained(
-                path,
-                trust_remote_code=True,
-            ).is_encoder_decoder
-            else "text-generation"
-        )
+        try:
+            # Try loading as a full model (HF model or local full model)
+            config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+
+        except Exception:
+            try:
+                from peft import PeftConfig
+                # If full model loading fails, try loading as a PEFT adapter
+                peft_config = PeftConfig.from_pretrained(path)
+
+                if not peft_config.base_model_name_or_path:
+                    raise ValueError(f"Base model name not found in PEFT config for {path}")
+
+                # Load the base model's config
+                config = AutoConfig.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)
+            except Exception as err2:
+                raise ValueError(f"Could not determine model type for: {path}") from err2
+
+
+        self.task = "text2text-generation" if config.is_encoder_decoder else "text-generation"

     def _get_model_args(self) -> Dict[str, Any]:
         import torch
@@ -977,6 +993,7 @@ class HFPipelineBasedInferenceEngine(

         return args

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def _create_pipeline(self, model_args: Dict[str, Any]):
         from transformers import AutoTokenizer, pipeline

@@ -3336,6 +3353,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
     def get_engine_id(self):
         return get_model_and_label_id(self.model_name, self.label)

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare_engine(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
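Two cache-related behaviors change here: diskcache is now imported lazily and only when use_cache is set, and the cache key is stamped with the unitxt version. The effect of the version stamp, as a minimal self-contained sketch of the key scheme (record fields simplified):

    import hashlib
    import json

    def cache_key(record: dict, version: str) -> str:
        # Mirrors record["version"] = constants.version before hashing.
        record = dict(record, version=version)
        return hashlib.md5(json.dumps(record, sort_keys=True).encode()).hexdigest()

    request = {"source": "What is 2+2?", "model": "some-model"}
    # Identical requests under different unitxt versions hash to different
    # keys, so cached generations are not reused across upgrades.
    assert cache_key(request, "1.21.0") != cache_key(request, "1.22.0")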
 
llm_as_judge_constants.py CHANGED
@@ -85,7 +85,7 @@ class EvaluatorNameEnum(str, Enum):

 class ModelProviderEnum(str, Enum):
     WATSONX = "watsonx"
-    OPENAI = "openai"
+    OPENAI = "open-ai"
     RITS = "rits"
     AZURE_OPENAI = "azure"

loaders.py CHANGED
@@ -57,7 +57,6 @@ import pandas as pd
 import requests
 from datasets import (
     DatasetDict,
-    DownloadConfig,
     IterableDataset,
     IterableDatasetDict,
     get_dataset_split_names,
@@ -75,7 +74,7 @@ from .operators import Set
 from .settings_utils import get_settings
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype
-from .utils import LRUCache, recursive_copy
+from .utils import LRUCache, recursive_copy, retry_connection_with_exponential_backoff

 logger = get_logger()
 settings = get_settings()
@@ -84,6 +83,7 @@ class UnitxtUnverifiedCodeError(UnitxtError):
     def __init__(self, path):
         super().__init__(f"Loader cannot load and run remote code from {path} in huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE.", Documentation.SETTINGS)

+@retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_load_dataset(path: str, *args, **kwargs):
     if settings.hf_offline_datasets_path is not None:
         path = os.path.join(settings.hf_offline_datasets_path, path)
@@ -91,9 +91,6 @@ def hf_load_dataset(path: str, *args, **kwargs):
         return _hf_load_dataset(
             path,
             *args, **kwargs,
-            download_config=DownloadConfig(
-                max_retries=settings.loaders_max_retries,
-            ),
             verification_mode="no_checks",
             trust_remote_code=settings.allow_unverified_code,
             download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
@@ -101,6 +98,24 @@ def hf_load_dataset(path: str, *args, **kwargs):
     except ValueError as e:
         if "trust_remote_code" in str(e):
             raise UnitxtUnverifiedCodeError(path) from e
+        raise e # Re raise
+
+
+@retry_connection_with_exponential_backoff(backoff_factor=2)
+def hf_get_dataset_splits(path: str, name: str):
+    try:
+        return get_dataset_split_names(
+            path=path,
+            config_name=name,
+            trust_remote_code=settings.allow_unverified_code,
+        )
+    except Exception as e:
+        if "trust_remote_code" in str(e):
+            raise UnitxtUnverifiedCodeError(path) from e
+
+        if "Couldn't find cache" in str(e):
+            raise FileNotFoundError(f"Dataset cache path={path}, name={name} was not found.") from e
+        raise e # Re raise

 class Loader(SourceOperator):
     """A base class for all loaders.
@@ -287,6 +302,9 @@ class LoadHF(LazyLoader):
             return settings.stream_hf_datasets_by_default
         return self.streaming

+    def is_in_cache(self, split):
+        dataset_id = str(self) + "_" + str(split)
+        return dataset_id in self.__class__._loader_cache
     # returns Dict when split names are not known in advance, and just the the single split dataset - if known
     def load_dataset(
         self, split: str, streaming=None, disable_memory_caching=False
@@ -307,9 +325,15 @@ class LoadHF(LazyLoader):
                 split=split,
                 num_proc=self.num_proc,
             )
-            self.__class__._loader_cache.max_size = settings.loader_cache_size
+
+        if dataset is None:
+            raise NotImplementedError() from None
+
         if not disable_memory_caching:
+            self.__class__._loader_cache.max_size = settings.loader_cache_size
             self.__class__._loader_cache[dataset_id] = dataset
+            self._already_logged_limited_loading = True
+
         return dataset

     def _maybe_set_classification_policy(self):
@@ -323,22 +347,16 @@ class LoadHF(LazyLoader):
             None,  # No warning when loading from public hub
         )

+    @retry_connection_with_exponential_backoff(max_retries=3, backoff_factor=2)
     def get_splits(self):
         if self.splits is not None:
             return self.splits
         try:
-            return get_dataset_split_names(
+            return hf_get_dataset_splits(
                 path=self.path,
-                config_name=self.name,
-                trust_remote_code=settings.allow_unverified_code,
-                download_config=DownloadConfig(
-                    max_retries=settings.loaders_max_retries,
-                    extract_on_the_fly=True,
-                ),
+                name=self.name,
             )
-        except Exception as e:
-            if "trust_remote_code" in str(e):
-                raise UnitxtUnverifiedCodeError(self.path) from e
+        except Exception:
             UnitxtWarning(
                 f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
             )
@@ -350,11 +368,16 @@ class LoadHF(LazyLoader):
             NotImplementedError
         ):  # streaming is not supported for zipped files so we load without streaming
             dataset = self.load_dataset(split=None, streaming=False)
+
+        if dataset is None:
+            raise FileNotFoundError(f"Dataset path={self.path}, name={self.name} was not found.") from None
+
         return list(dataset.keys())

     def split_generator(self, split: str) -> Generator:
         if self.get_limit() is not None:
-            self.log_limited_loading()
+            if not self.is_in_cache(split):
+                self.log_limited_loading()
         try:
             dataset = self.load_dataset(split=split)
         except (
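Split discovery now funnels through the retryable hf_get_dataset_splits helper, which maps raw loader errors onto clearer ones. A hedged sketch of the calling pattern (the dataset path and name are illustrative). Note that FileNotFoundError is itself in the decorator's default retry_exceptions tuple, so a missing cache is retried before it propagates.

    from unitxt.loaders import hf_get_dataset_splits

    try:
        splits = hf_get_dataset_splits(path="glue", name="cola")
    except FileNotFoundError:
        # Raised when the underlying error says "Couldn't find cache",
        # e.g. running offline with no cached copy of the dataset.
        splits = None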
metrics.py CHANGED
@@ -29,7 +29,6 @@ import numpy
 import numpy as np
 import pandas as pd
 import requests
-from datasets import DownloadConfig
 from scipy.stats import bootstrap
 from scipy.stats._warnings_errors import DegenerateDataWarning

@@ -65,14 +64,14 @@ from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
 from .type_utils import Type, isoftype, parse_type_string, to_type_string
-from .utils import deep_copy, recursive_copy
+from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff

 logger = get_logger()
 settings = get_settings()

 warnings.filterwarnings("ignore", category=DegenerateDataWarning)

-
+@retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_evaluate_load(path: str, *args, **kwargs):
     if settings.hf_offline_metrics_path is not None:
         path = os.path.join(settings.hf_offline_metrics_path, path)
@@ -81,9 +80,6 @@ def hf_evaluate_load(path: str, *args, **kwargs):
         *args,
         **kwargs,
         experiment_id=str(uuid.uuid4()),
-        download_config=DownloadConfig(
-            max_retries=settings.loaders_max_retries,
-        ),
         verification_mode="no_checks",
         trust_remote_code=settings.allow_unverified_code,
         download_mode=(
@@ -127,6 +123,7 @@ def nan_max(x):
         warnings.simplefilter("ignore", category=RuntimeWarning)
         return np.nanmax(x)

+
 def nan_std(x):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
@@ -398,12 +395,14 @@ class Statistic:
         result = np.array([scores[m] for m in self.score_names])
         self._history.append(result)
         return result
+
     def mean(self, idx):
         return nan_mean([result[idx] for result in self._history])

     def std(self, idx):
         return nan_std([result[idx] for result in self._history])

+
 class ConfidenceIntervalMixin(Artifact):
     n_resamples: int = 1000
     confidence_level: float = 0.95
@@ -413,18 +412,16 @@ class ConfidenceIntervalMixin(Artifact):
     def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
         pass

-
     def bootstrap(self, data: List[Any], score_names: List[str]):
         if self.ci_score_names is not None:
             score_names = self.ci_score_names

-
         statistic = Statistic(data, score_names, self._sample_to_scores)
         with warnings.catch_warnings():
-            warnings.filterwarnings( # Ignore error the arises when all sample scores are identical
+            warnings.filterwarnings(  # Ignore error the arises when all sample scores are identical
                 "ignore",
                 message="invalid value encountered in divide",
-                category=RuntimeWarning
+                category=RuntimeWarning,
             )

             intervals = bootstrap(
@@ -438,14 +435,17 @@ class ConfidenceIntervalMixin(Artifact):
             method="BCa",
         ).confidence_interval

-
         result = {}
         for i, metric in enumerate(score_names):
             high = intervals.high[i]
             low = intervals.low[i]
             if np.isnan(high) and np.isnan(low):
-                if statistic.std(i) == 0: # When sample scores are identical "BCa" will fail (due to division by std 0)
-                    high = low = statistic.mean(i) # In this case we will use the mean (as there is no variance)
+                if (
+                    statistic.std(i) == 0
+                ):  # When sample scores are identical "BCa" will fail (due to division by std 0)
+                    high = low = statistic.mean(
+                        i
+                    )  # In this case we will use the mean (as there is no variance)
             result[f"{metric}_ci_low"] = float(low)
             result[f"{metric}_ci_high"] = float(high)

@@ -2807,7 +2807,7 @@ class FinQAEval(InstanceMetric):
         remote_url = "https://raw.githubusercontent.com/czyssrs/FinQA/dfc5b72c01ee17c442d28d5201b82a1f4e95d5af/code/evaluate/evaluate.py"
         local_filepath = "/tmp/finqa_eval_script.py"
         module_name = "finqa_eval"
-        hash_of_script = "42430b8613082bb4b85d49210284135d" # pragma: allowlist secret
+        hash_of_script = "42430b8613082bb4b85d49210284135d"  # pragma: allowlist secret

         download_finqa_eval_script_file(remote_url, local_filepath, hash_of_script)
         self.finqa_module = load_finqa_eval_module_from_file(
@@ -3415,10 +3415,11 @@ class CustomF1(GlobalMetric):

 class KeyValueExtraction(GlobalMetric):

-    prediction_type = Dict[str,str]
-    metric : Metric
+    prediction_type = Dict[str, str]
+    metric: Metric
     single_reference_per_prediction = True
     main_score = ""
+
     def prepare(self):
         super().prepare()
         self.main_score = f"{self.metric.main_score}_micro"
@@ -3436,18 +3437,25 @@ class KeyValueExtraction(GlobalMetric):
         for reference in references:
             all_reference_keys.update(list(reference.keys()))
         for key in all_reference_keys:
-            key_statistics[key]= []
+            key_statistics[key] = []

-        num_prediction_keys=0
-        illegal_prediction_keys=0
+        num_prediction_keys = 0
+        illegal_prediction_keys = 0
         for reference, prediction in zip(references, predictions):
             for key in all_reference_keys:
-                if (key not in reference and key not in prediction):
+                if key not in reference and key not in prediction:
                     continue
-                if (key in reference and key in prediction):
-                    multi_stream = MultiStream.from_iterables({"test": [{"prediction" : prediction[key],
-                                                                         "references" : [reference[key]]}
-                                                                        ]})
+                if key in reference and key in prediction:
+                    multi_stream = MultiStream.from_iterables(
+                        {
+                            "test": [
+                                {
+                                    "prediction": prediction[key],
+                                    "references": [reference[key]],
+                                }
+                            ]
+                        }
+                    )
                     output_multi_stream = self.metric(multi_stream)
                     output_stream = output_multi_stream["test"]
                     score = next(iter(output_stream))["score"]["global"]["score"]
@@ -3460,7 +3468,7 @@ class KeyValueExtraction(GlobalMetric):
             if key not in all_reference_keys:
                 illegal_prediction_keys += 1

-        result={}
+        result = {}

         average = 0
         total = 0
@@ -3476,13 +3484,16 @@ class KeyValueExtraction(GlobalMetric):

         result[f"{self.metric.main_score}_micro"] = weighted_average / total
         result[f"{self.metric.main_score}_macro"] = average / len(key_statistics)
-        if (num_prediction_keys !=0):
-            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+        if num_prediction_keys != 0:
+            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = (
+                1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+            )
         else:
             result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 0

         return result

+
 class NER(CustomF1):
     """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""

@@ -3713,6 +3724,7 @@ class Detector(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         import torch
@@ -3753,6 +3765,7 @@ class RegardMetric(GlobalMetric):

     _requirements_list: List[str] = ["transformers", "torch", "tqdm"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoModelForSequenceClassification, AutoTokenizer
@@ -3942,6 +3955,7 @@ class SafetyMetric(MapReduceMetric[str, Tuple[float, str]], TorchDeviceMixin):

         return result

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import pipeline
@@ -4121,6 +4135,7 @@ class Perplexity(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def compute(
         self,
         references: List[List[Any]],
@@ -4394,6 +4409,7 @@ class FaithfulnessHHEM(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         import torch
@@ -6051,6 +6067,7 @@ class GraniteGuardianBase(InstanceMetric):

     _requirements_list: List[str] = ["torch", "transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         from transformers import AutoTokenizer

@@ -6116,9 +6133,18 @@ class GraniteGuardianBase(InstanceMetric):
         )
         messages = self.process_input_fields(task_data)
         prompt = self.get_prompt(messages)
-        data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")
+        data_classification_policy = task_data.get("metadata", {}).get(
+            "data_classification_policy"
+        )

-        result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])
+        result = self.inference_engine.infer_log_probs(
+            [
+                {
+                    "source": prompt,
+                    "data_classification_policy": data_classification_policy,
+                }
+            ]
+        )

         generated_tokens_list = result[0]
         label, prob_of_risk = self.parse_output(generated_tokens_list)
@@ -6371,13 +6397,20 @@ class SQLExecutionAccuracy(InstanceMetric):
         df1.fillna(0, inplace=True)
         df2.fillna(0, inplace=True)

+        # Compare row counts first for a quick check
         if df1.shape != df2.shape:
             return False

-        df1_rows_sorted = [sorted(map(str, row)) for row in df1.to_numpy()]
-        df2_rows_sorted = [sorted(map(str, row)) for row in df2.to_numpy()]
+        # Convert DataFrames to numpy arrays of strings to handle mixed types
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        # Sort each row's elements (column order independence)
+        df1_sorted_rows = np.array([np.sort(row) for row in df1_array])
+        df2_sorted_rows = np.array([np.sort(row) for row in df2_array])

-        return df1_rows_sorted == df2_rows_sorted
+        # Compare the sorted rows in order
+        return np.array_equal(df1_sorted_rows, df2_sorted_rows)

     @staticmethod
     def compare_dfs_ignore_colnames_unordered_rows(df1, df2):
@@ -6391,46 +6424,85 @@ class SQLExecutionAccuracy(InstanceMetric):
         True if the DataFrames have the same content (ignoring column names and row order),
         False otherwise.
         """
-        return set(map(tuple, df1.to_numpy())) == set(map(tuple, df2.to_numpy()))
+        # Compare shapes early on
+        if df1.shape != df2.shape:
+            return False
+
+        # Convert DataFrames to numpy arrays of strings (to handle mixed data types)
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        # Sort columns first, then sort rows
+        df1_sorted = np.sort(np.sort(df1_array, axis=1), axis=0)
+        df2_sorted = np.sort(np.sort(df2_array, axis=1), axis=0)
+
+        # Compare the sorted arrays
+        return np.array_equal(df1_sorted, df2_sorted)

     @staticmethod
-    def is_subset_ignore_colnames(df1, df2):
-        """Checks if df1 is a subset of df2 based on row content, ignoring column names.
+    def compare_dfs_ignore_colnames_subset(df1, df2, ignore_row_order=True):
+        """Checks if the values of either DataFrame are a subset of the values in the other DataFrame.
+
+        Comparison is column order independent, and could optionally be row order independent.
+        We interpret "subset" as follows:
+        - For each row in df1, there must be a matching (or superset) row in df2, i.e. the set of values
+          in the df1 row is a subset of the set of values in that df2 row. Then do the same check in reverse.
+        - If either condition (df1 is subset of df2 OR df2 is subset of df1) is satisfied, return True.
+
+        We treat an empty dataframe as a subset of nothing, while in theory is a subset of any dataframe.

         Args:
-        df1: Pandas DataFrame 1 to compare.
-        df2: Pandas DataFrame 2 to compare.
+        df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
+        df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
+        ignore_row_order (bool): If True, row order doesn't matter; if False, row order is respected.

         Returns:
-        True if df1 is a subset of df2 based on column values,
-        False otherwise.
+        bool: True if df1 is a subset of df2 or vice versa, based on the specified row-order condition.
         """
-        if df1.empty or df2.empty or df1.shape[1] > df2.shape[1]:
-            return False
-
-        def make_hashable(value):
-            if isinstance(value, dict):
-                return json.dumps(value, sort_keys=True)
-            if isinstance(value, list):
-                return tuple(value)
-            return value
-
-        df1_cols = [
-            tuple(make_hashable(value) for value in df1.iloc[:, i])
-            for i in range(df1.shape[1])
-        ]
-        df2_cols = [
-            tuple(make_hashable(value) for value in df2.iloc[:, j])
-            for j in range(df2.shape[1])
-        ]
-        df2_cols_count = Counter(df2_cols)
-        for col in df1_cols:
-            if df2_cols_count[col] > 0:
-                df2_cols_count[col] -= 1
-            else:
-                return False
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        df1_sorted_rows = [np.sort(row) for row in df1_array]
+        df2_sorted_rows = [np.sort(row) for row in df2_array]
+
+        def row_is_subset(r_small, r_big):
+            """Check if all elements of r_small are in r_big."""
+            return set(r_small).issubset(set(r_big))
+
+        def df_is_subset_of_another(rows_small, rows_big, respect_order):
+            """Check if the rows_small is subset of rows_big under the given order condition."""
+            if not rows_small:
+                return False  # DataFrame needs to be non-empty
+
+            # If row order matters:
+            if respect_order:
+                i, j = 0, 0
+                while i < len(rows_small) and j < len(rows_big):
+                    if row_is_subset(rows_small[i], rows_big[j]):
+                        i += 1
+                    j += 1
+                return i == len(rows_small)
+            # Row order doesn't matter:
+            matched_indices = set()
+            for r_small in rows_small:
+                found_match = False
+                for idx, r_big in enumerate(rows_big):
+                    if idx not in matched_indices and row_is_subset(r_small, r_big):
+                        found_match = True
+                        matched_indices.add(idx)
+                        break
+                if not found_match:
+                    return False
+            return True
+
+        df1_sub_df2 = df_is_subset_of_another(
+            df1_sorted_rows, df2_sorted_rows, not ignore_row_order
+        )
+        df2_sub_df1 = df_is_subset_of_another(
+            df2_sorted_rows, df1_sorted_rows, not ignore_row_order
+        )

-        return True
+        return df1_sub_df2 or df2_sub_df1

     def get_sql_execution_results(
         self, predicted_sql: str, gold_sql: str, connector
@@ -6446,7 +6518,7 @@ class SQLExecutionAccuracy(InstanceMetric):
         a 12-tuple of
         1. execution_result: if df responses match
         2. non_empty_execution_result: if dfs are non-empty and match
-        3. subset_non_empty_execution_result: if non-empty dfs and gt df subset of predicted df
+        3. subset_non_empty_execution_result: if non-empty dfs and one is a subset of the other
         4. non_empty_gold_df: if gt df is non-empty
         5. gold_sql_runtime: ground truth query runtime
         6. predicted_sql_runtime: predicted query runtime
@@ -6569,12 +6641,21 @@ class SQLExecutionAccuracy(InstanceMetric):
         pred_res = pred_res["results"]
         predicted_df = pd.DataFrame(pred_res)

+        subset_non_empty_execution_result = 0
+        non_empty_execution_result = 0
         if "ORDER BY" in gold_sql.upper():
             execution_result = (
                 1
                 if self.compare_dfs_ignore_colnames_ordered_rows(predicted_df, gold_df)
                 else 0
             )
+            if non_empty_gold_df:
+                if execution_result == 1:
+                    non_empty_execution_result = 1
+                if self.compare_dfs_ignore_colnames_subset(
+                    gold_df, predicted_df, ignore_row_order=False
+                ):
+                    subset_non_empty_execution_result = 1
         else:
             execution_result = (
                 1
@@ -6583,14 +6664,13 @@ class SQLExecutionAccuracy(InstanceMetric):
                 )
                 else 0
             )
-
-        subset_non_empty_execution_result = 0
-        non_empty_execution_result = 0
-        if non_empty_gold_df:
-            if execution_result == 1:
-                non_empty_execution_result = 1
-            if self.is_subset_ignore_colnames(gold_df, predicted_df):
-                subset_non_empty_execution_result = 1
+            if non_empty_gold_df:
+                if execution_result == 1:
+                    non_empty_execution_result = 1
+                if self.compare_dfs_ignore_colnames_subset(
+                    gold_df, predicted_df, ignore_row_order=True
+                ):
+                    subset_non_empty_execution_result = 1

         return (
             execution_result,
@@ -6672,6 +6752,7 @@ class SQLNonExecutionAccuracy(InstanceMetric):
             "sqlglot_optimized_equivalence",
             "sqlparse_equivalence",
             "sql_exact_match",
+            "sql_syntactic_equivalence",
         ]
     }
     main_score = "sqlglot_equivalence"
@@ -6682,6 +6763,7 @@ class SQLNonExecutionAccuracy(InstanceMetric):
         "sqlglot_optimized_equivalence",
         "sqlparse_equivalence",
         "sql_exact_match",
+        "sql_syntactic_equivalence",
     ]

     prediction_type = "Any"  # string representation is compared
@@ -6729,6 +6811,17 @@ class SQLNonExecutionAccuracy(InstanceMetric):
             ),
             "sql_exact_match": float(sql_exact_match(predicted_sql, gold_sql)),
         }
+        result["sql_syntactic_equivalence"] = float(
+            any(
+                result[key]
+                for key in [
+                    "sqlglot_equivalence",
+                    "sqlglot_optimized_equivalence",
+                    "sqlparse_equivalence",
+                    "sql_exact_match",
+                ]
+            )
+        )
         logger.debug(f"SQL Non Execution Accuracy Result: {result}")
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
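To make the new subset semantics concrete: comparison is cell-value based after casting to strings, column order is ignored within each row, and with ignore_row_order=True every gold row just needs some distinct predicted row whose values contain it. A small illustrative check, assuming the class is importable as shown:

    import pandas as pd

    from unitxt.metrics import SQLExecutionAccuracy

    gold = pd.DataFrame([[1, "a"], [2, "b"]])
    pred = pd.DataFrame([["a", 1, "x"], ["b", 2, "y"], ["c", 3, "z"]])

    # Each gold row's values appear within some distinct predicted row,
    # so the gold result counts as a subset of the predicted result.
    assert SQLExecutionAccuracy.compare_dfs_ignore_colnames_subset(
        gold, pred, ignore_row_order=True
    )

The new sql_syntactic_equivalence score, by contrast, is simply an OR over the four existing syntactic scores, so it is 1.0 whenever any one of them fires.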
processors.py CHANGED
@@ -292,6 +292,13 @@ class ExtractMtBenchRatingJudgment(FieldOperator):
         except:
             return 0.0

+class ExtractHarmRatingJudgement(FieldOperator):
+    def process_value(self, text: Any) -> Any:
+        match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
+        try:
+            return float(match.group(1))*0.25 - 0.25
+        except:
+            return np.NaN

 class ExtractMtBenchLabelJudgment(FieldOperator):
     def process_value(self, text: Any) -> Any:
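The new ExtractHarmRatingJudgement maps a bracketed judge rating onto the unit interval: float(r)*0.25 - 0.25 sends ratings 1 through 5 to 0.0 through 1.0 in steps of 0.25. A quick standalone check of the endpoints, reimplementing the operator's core for illustration:

    import re

    import numpy as np

    def harm_score(text: str) -> float:
        match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
        try:
            return float(match.group(1)) * 0.25 - 0.25
        except (AttributeError, ValueError):  # no match or unparsable number
            return np.nan

    assert harm_score("verdict: [[1]]") == 0.0  # minimum rating
    assert harm_score("verdict: [[3]]") == 0.5  # midpoint
    assert harm_score("verdict: [[5]]") == 1.0  # maximum rating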
settings_utils.py CHANGED
@@ -160,6 +160,7 @@ if Settings.is_uninitilized():
     settings.hf_offline_metrics_path = None
     settings.hf_offline_models_path = None
     settings.inference_engine_cache_path = "./inference_engine_cache/"
+    settings.max_connection_retries = 3

 if Constants.is_uninitilized():
     constants = Constants()
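This is the knob the retry decorator in utils.py reads when no explicit max_retries is passed. Adjusting it in code is straightforward; whether it can also be set through a UNITXT_MAX_CONNECTION_RETRIES environment variable is an assumption based on how the other settings in this file behave, not something this diff shows.

    from unitxt.settings_utils import get_settings

    settings = get_settings()
    settings.max_connection_retries = 5  # default is 3 as of this commit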
sql_utils.py CHANGED
@@ -275,8 +275,15 @@ class Cache:

         logger.info(f"Cache miss for key: {key}. Computing value...")
         result = compute_fn()
-        self.cache[key] = result
-        logger.info(f"Stored result in cache for key: {key}")
+
+        if result and not (
+            isinstance(result, tuple) and len(result) == 2 and result[0] is None
+        ):
+            self.cache[key] = result
+            logger.info(f"Stored result in cache for key: {key}")
+        else:
+            logger.info(f"None result. Bypassing caching for key: {key}")
+
         return result

     async def async_get_or_set(self, key, compute_fn, no_cache=False, refresh=False):
@@ -494,7 +501,7 @@ class RemoteDatabaseConnector(DatabaseConnector):

         schema_text = ""
         for table in schema["tables"]:
-            schema_text += f"Table: {table['table_name']} has columns: {[col['column_name'] for col in table['columns']]}\n"
+            schema_text += f"Table: {table['name'] if 'name' in table else table['table_name']} has columns: {[col['name'] if 'name' in col else col['column_name'] for col in table['columns']]}\n"

         return schema_text
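The guard keeps failed lookups out of the cache: falsy results, and 2-tuples whose first element is None (a "no result plus error" shape), are returned but not stored, so the next call recomputes rather than replaying the failure. A compact restatement of the condition:

    def should_cache(result) -> bool:
        # Mirrors the new guard in the Cache get-or-set path.
        return bool(result) and not (
            isinstance(result, tuple) and len(result) == 2 and result[0] is None
        )

    assert should_cache(("rows", 0.12)) is True            # real result: cached
    assert should_cache((None, "timeout error")) is False  # error tuple: skipped
    assert should_cache(None) is False                     # empty result: skipped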
 
standard.py CHANGED
@@ -503,7 +503,7 @@ class DatasetRecipe(SourceSequentialOperator):
             loader = self.card.loader
             if self.loader_limit:
                 loader.loader_limit = self.loader_limit
-                logger.info(f"Loader line limit was set to {self.loader_limit}")
+                # logger.info(f"Loader line limit was set to {self.loader_limit}")
             self.loading.steps.append(loader)

         # This is required in case loader_limit is not enforced by the loader
string_operators.py CHANGED
@@ -9,6 +9,7 @@ from typing import (

 from .operators import FieldOperator, InstanceOperator
 from .settings_utils import get_settings
+from .utils import retry_connection_with_exponential_backoff

 settings = get_settings()

@@ -50,6 +51,7 @@ class TokensSlice(FieldOperator):

     _requirements_list = ["transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer
utils.py CHANGED
@@ -1,15 +1,92 @@
 import copy
+import functools
 import importlib.util
 import json
+import logging
 import os
+import random
 import re
 import threading
+import time
 from collections import OrderedDict
 from functools import lru_cache
 from typing import Any, Dict
+from urllib.error import HTTPError as UrllibHTTPError

+from requests.exceptions import ConnectionError, HTTPError
+from requests.exceptions import Timeout as TimeoutError
+
+from .settings_utils import get_settings
 from .text_utils import is_made_of_sub_strings

+settings = get_settings()
+
+def retry_connection_with_exponential_backoff(max_retries=None,
+                                              retry_exceptions=(ConnectionError, TimeoutError, HTTPError, FileNotFoundError, UrllibHTTPError),
+                                              backoff_factor=1):
+    """Decorator that implements retry with exponential backoff for network operations.
+
+    Also handles errors that were triggered by the specified retry exceptions,
+    whether they're direct causes or part of the exception context.
+
+    Args:
+        max_retries: Maximum number of retry attempts (falls back to settings if None)
+        retry_exceptions: Tuple of exceptions that should trigger a retry
+        backoff_factor: Base delay factor in seconds for backoff calculation
+
+    Returns:
+        The decorated function with retry logic
+    """
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get max_retries from settings if not provided
+            retries = max_retries if max_retries is not None else settings.max_connection_retries
+
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    # Check if this exception or any of its causes match the retry exceptions
+                    should_retry = False
+                    current_exc = e
+
+                    # Check the exception chain for both __cause__ (explicit) and __context__ (implicit)
+                    visited_exceptions = set()  # To prevent infinite loops in rare cyclic exception references
+
+                    while current_exc is not None and id(current_exc) not in visited_exceptions:
+                        visited_exceptions.add(id(current_exc))
+
+                        if isinstance(current_exc, retry_exceptions):
+                            should_retry = True
+                            break
+
+                        # First check __cause__ (from "raise X from Y")
+                        if current_exc.__cause__ is not None:
+                            current_exc = current_exc.__cause__
+                        # Then check __context__ (from "try: ... except: raise X")
+                        elif current_exc.__context__ is not None:
+                            current_exc = current_exc.__context__
+                        else:
+                            # No more causes in the chain
+                            break
+
+                    if not should_retry:
+                        # Not a retry exception or caused by a retry exception, so re-raise
+                        raise
+
+                    if attempt >= retries - 1:  # Last attempt
+                        raise  # Re-raise the last exception
+
+                    # Calculate exponential backoff with jitter
+                    wait_time = backoff_factor * (2 ** attempt) + random.uniform(0, 1)
+                    logging.warning(f"{func.__name__} failed (attempt {attempt+1}/{retries}). "
+                                    f"Retrying in {wait_time:.2f}s. Error: {e!s}")
+                    time.sleep(wait_time)
+
+            raise ValueError("there was a problem") from None
+        return wrapper
+    return decorator

 class Singleton(type):
     _instances = {}
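Putting the new decorator together: with the default settings.max_connection_retries = 3 and backoff_factor=2, a failing call is attempted three times, sleeping backoff_factor * 2**attempt seconds plus up to one second of jitter between attempts (roughly 2-3s, then 4-5s) before the exception finally propagates. Because the wrapper walks __cause__ and __context__, wrapped network errors also trigger retries, as this small self-contained demonstration shows (it sleeps a few seconds while retrying):

    from requests.exceptions import ConnectionError

    from unitxt.utils import retry_connection_with_exponential_backoff

    attempts = {"n": 0}

    @retry_connection_with_exponential_backoff(max_retries=3, backoff_factor=2)
    def flaky():
        attempts["n"] += 1
        if attempts["n"] < 3:
            # The ConnectionError is only the __cause__ of the RuntimeError,
            # but the decorator inspects the whole exception chain.
            raise RuntimeError("wrapper error") from ConnectionError("network down")
        return "ok"

    assert flaky() == "ok" and attempts["n"] == 3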
version.py CHANGED
@@ -1 +1 @@
-version = "1.21.0"
+version = "1.22.0"