Elron committed
Commit 64dd81e · verified · 1 Parent(s): d346c89

Upload folder using huggingface_hub

Files changed (14):
  1. api.py +2 -1
  2. formats.py +3 -0
  3. fusion.py +3 -0
  4. inference.py +28 -10
  5. llm_as_judge_constants.py +1 -1
  6. loaders.py +40 -17
  7. metrics.py +166 -73
  8. processors.py +7 -0
  9. settings_utils.py +1 -0
  10. sql_utils.py +10 -3
  11. standard.py +1 -1
  12. string_operators.py +2 -0
  13. utils.py +77 -0
  14. version.py +1 -1
api.py CHANGED
@@ -9,6 +9,7 @@ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from datasets.exceptions import DatasetGenerationError

 from .artifact import fetch_artifact
+from .benchmark import Benchmark
 from .card import TaskCard
 from .dataset_utils import get_dataset_artifact
 from .error_utils import UnitxtError
@@ -78,7 +79,7 @@ def _verify_dataset_args(dataset_query: Optional[str] = None, dataset_args=None)


 def load_recipe(dataset_query: Optional[str] = None, **kwargs) -> DatasetRecipe:
-    if isinstance(dataset_query, DatasetRecipe):
+    if isinstance(dataset_query, (DatasetRecipe, Benchmark)):
         return dataset_query

     _verify_dataset_args(dataset_query, kwargs)

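A minimal usage sketch of what this change enables: a prebuilt Benchmark now passes straight through load_recipe, just like a DatasetRecipe. The Benchmark construction below is illustrative; its exact arguments come from unitxt's benchmark module and are assumed here, not shown in this diff.

    from unitxt.api import load_recipe
    from unitxt.benchmark import Benchmark
    from unitxt.standard import DatasetRecipe

    # Hypothetical benchmark; "cards.cola" is an assumed card name.
    benchmark = Benchmark(subsets={"cola": DatasetRecipe(card="cards.cola")})

    # With this commit, load_recipe returns the Benchmark unchanged instead
    # of failing the dataset_query argument verification.
    recipe = load_recipe(benchmark)
    assert recipe is benchmark
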
formats.py CHANGED
@@ -18,6 +18,7 @@ from .image_operators import image_to_data_url
 from .operator import InstanceOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype
+from .utils import retry_connection_with_exponential_backoff

 constants = get_constants()

@@ -33,6 +34,7 @@ class GraniteDocumentsFormat(Format):

     _requirements_list = ["transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer
@@ -487,6 +489,7 @@ class HFSystemFormat(ChatAPIFormat):
     model_name: str
     _requirements_list = ["transformers", "Jinja2"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer

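A short sketch of what the new decorator guards in these prepare() methods: the tokenizer fetch can fail on a flaky hub connection, and the decorator now retries with exponential backoff instead of surfacing the error immediately. The standalone function below mimics the decorated pattern; the function name and model argument are illustrative.

    from unitxt.utils import retry_connection_with_exponential_backoff

    @retry_connection_with_exponential_backoff(backoff_factor=2)
    def fetch_tokenizer(model_name: str):
        from transformers import AutoTokenizer
        # A requests.ConnectionError raised inside from_pretrained (even when
        # wrapped as the __cause__ of another error) is retried with backoff.
        return AutoTokenizer.from_pretrained(model_name)
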
fusion.py CHANGED
@@ -2,11 +2,13 @@ from abc import abstractmethod
 from typing import Dict, Generator, List, Optional, Union

 from .dataclass import NonPositionalField
+from .logging_utils import get_logger
 from .operator import SourceOperator
 from .random_utils import new_random_generator
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype

+logger = get_logger()

 class BaseFusion(SourceOperator):
     """BaseFusion operator that combines multiple multistreams into one.
@@ -76,6 +78,7 @@ class FixedFusion(BaseFusion):
             if split not in multi_stream:
                 continue
             emitted_from_this_split = 0
+            logger.info(f"Processing {split} from {origin_name}...")
             try:
                 for instance in multi_stream[split]:
                     if (

inference.py CHANGED
@@ -31,7 +31,6 @@ from typing import (
 )

 from datasets import Dataset, DatasetDict, Image
-from diskcache import Cache
 from tqdm import tqdm, trange
 from tqdm.asyncio import tqdm_asyncio

@@ -50,6 +49,7 @@ from .operator import PackageRequirementsMixin
 from .operators import ArtifactFetcherMixin
 from .settings_utils import get_constants, get_settings
 from .type_utils import isoftype
+from .utils import retry_connection_with_exponential_backoff

 constants = get_constants()
 settings = get_settings()
@@ -183,7 +183,9 @@ class InferenceEngine(Artifact):
         if not settings.mock_inference_mode:
             super().prepare()  # no need to prepare a mock
             self.prepare_engine()
-        self._cache = Cache(get_settings().inference_engine_cache_path + self.__class__.__name__)
+        if self.use_cache:
+            from diskcache import Cache
+            self._cache = Cache(settings.inference_engine_cache_path + self.__class__.__name__)

     def __call__(
         self,
@@ -199,6 +201,7 @@ class InferenceEngine(Artifact):
     def _get_cache_key(self, instance: Dict[str, Any]) -> str:
         """Generate a unique cache key for each input."""
         record = self.get_instance_cache_key(instance)
+        record["version"] = constants.version
         record.update(self.to_dict())
         instance_str = json.dumps(record, sort_keys=True)
         return hashlib.md5(instance_str.encode()).hexdigest()
@@ -875,6 +878,7 @@ class HFPeftInferenceEngine(HFAutoModelInferenceEngine):
             self.peft_config.base_model_name_or_path
         )

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def _init_model(self):
         from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
         from transformers import AutoConfig
@@ -938,14 +942,26 @@ class HFPipelineBasedInferenceEngine(
         if settings.hf_offline_models_path is not None:
             path = os.path.join(settings.hf_offline_models_path, path)

-        self.task = (
-            "text2text-generation"
-            if AutoConfig.from_pretrained(
-                path,
-                trust_remote_code=True,
-            ).is_encoder_decoder
-            else "text-generation"
-        )
+        try:
+            # Try loading as a full model (HF model or local full model)
+            config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+
+        except Exception:
+            try:
+                from peft import PeftConfig
+                # If full model loading fails, try loading as a PEFT adapter
+                peft_config = PeftConfig.from_pretrained(path)
+
+                if not peft_config.base_model_name_or_path:
+                    raise ValueError(f"Base model name not found in PEFT config for {path}")
+
+                # Load the base model's config
+                config = AutoConfig.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)
+            except Exception as err2:
+                raise ValueError(f"Could not determine model type for: {path}") from err2
+
+
+        self.task = "text2text-generation" if config.is_encoder_decoder else "text-generation"

     def _get_model_args(self) -> Dict[str, Any]:
         import torch
@@ -977,6 +993,7 @@ class HFPipelineBasedInferenceEngine(

         return args

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def _create_pipeline(self, model_args: Dict[str, Any]):
         from transformers import AutoTokenizer, pipeline

@@ -3336,6 +3353,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
     def get_engine_id(self):
         return get_model_and_label_id(self.model_name, self.label)

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare_engine(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
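Two cache-related behaviors change here: diskcache is now imported lazily and only when use_cache is set, and the cache key is stamped with the unitxt version. The effect of the version stamp, as a minimal self-contained sketch of the key scheme (record fields simplified):

    import hashlib
    import json

    def cache_key(record: dict, version: str) -> str:
        # Mirrors record["version"] = constants.version before hashing.
        record = dict(record, version=version)
        return hashlib.md5(json.dumps(record, sort_keys=True).encode()).hexdigest()

    request = {"source": "What is 2+2?", "model": "some-model"}
    # Identical requests under different unitxt versions hash to different
    # keys, so cached generations are not reused across upgrades.
    assert cache_key(request, "1.21.0") != cache_key(request, "1.22.0")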
 
llm_as_judge_constants.py CHANGED
@@ -85,7 +85,7 @@ class EvaluatorNameEnum(str, Enum):

 class ModelProviderEnum(str, Enum):
     WATSONX = "watsonx"
-    OPENAI = "openai"
+    OPENAI = "open-ai"
     RITS = "rits"
     AZURE_OPENAI = "azure"

loaders.py CHANGED
@@ -57,7 +57,6 @@ import pandas as pd
 import requests
 from datasets import (
     DatasetDict,
-    DownloadConfig,
     IterableDataset,
     IterableDatasetDict,
     get_dataset_split_names,
@@ -75,7 +74,7 @@ from .operators import Set
 from .settings_utils import get_settings
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype
-from .utils import LRUCache, recursive_copy
+from .utils import LRUCache, recursive_copy, retry_connection_with_exponential_backoff

 logger = get_logger()
 settings = get_settings()
@@ -84,6 +83,7 @@ class UnitxtUnverifiedCodeError(UnitxtError):
     def __init__(self, path):
         super().__init__(f"Loader cannot load and run remote code from {path} in huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE.", Documentation.SETTINGS)

+@retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_load_dataset(path: str, *args, **kwargs):
     if settings.hf_offline_datasets_path is not None:
         path = os.path.join(settings.hf_offline_datasets_path, path)
@@ -91,9 +91,6 @@ def hf_load_dataset(path: str, *args, **kwargs):
         return _hf_load_dataset(
             path,
             *args, **kwargs,
-            download_config=DownloadConfig(
-                max_retries=settings.loaders_max_retries,
-            ),
             verification_mode="no_checks",
             trust_remote_code=settings.allow_unverified_code,
             download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
@@ -101,6 +98,24 @@ def hf_load_dataset(path: str, *args, **kwargs):
     except ValueError as e:
         if "trust_remote_code" in str(e):
             raise UnitxtUnverifiedCodeError(path) from e
+        raise e # Re raise
+
+
+@retry_connection_with_exponential_backoff(backoff_factor=2)
+def hf_get_dataset_splits(path: str, name: str):
+    try:
+        return get_dataset_split_names(
+            path=path,
+            config_name=name,
+            trust_remote_code=settings.allow_unverified_code,
+        )
+    except Exception as e:
+        if "trust_remote_code" in str(e):
+            raise UnitxtUnverifiedCodeError(path) from e
+
+        if "Couldn't find cache" in str(e):
+            raise FileNotFoundError(f"Dataset cache path={path}, name={name} was not found.") from e
+        raise e # Re raise

 class Loader(SourceOperator):
     """A base class for all loaders.
@@ -287,6 +302,9 @@ class LoadHF(LazyLoader):
             return settings.stream_hf_datasets_by_default
         return self.streaming

+    def is_in_cache(self, split):
+        dataset_id = str(self) + "_" + str(split)
+        return dataset_id in self.__class__._loader_cache
     # returns Dict when split names are not known in advance, and just the the single split dataset - if known
     def load_dataset(
         self, split: str, streaming=None, disable_memory_caching=False
@@ -307,9 +325,15 @@ class LoadHF(LazyLoader):
                 split=split,
                 num_proc=self.num_proc,
             )
-            self.__class__._loader_cache.max_size = settings.loader_cache_size
+
+        if dataset is None:
+            raise NotImplementedError() from None
+
         if not disable_memory_caching:
+            self.__class__._loader_cache.max_size = settings.loader_cache_size
             self.__class__._loader_cache[dataset_id] = dataset
+            self._already_logged_limited_loading = True
+
         return dataset

     def _maybe_set_classification_policy(self):
@@ -323,22 +347,16 @@ class LoadHF(LazyLoader):
             None,  # No warning when loading from public hub
         )

+    @retry_connection_with_exponential_backoff(max_retries=3, backoff_factor=2)
     def get_splits(self):
         if self.splits is not None:
             return self.splits
         try:
-            return get_dataset_split_names(
+            return hf_get_dataset_splits(
                 path=self.path,
-                config_name=self.name,
-                trust_remote_code=settings.allow_unverified_code,
-                download_config=DownloadConfig(
-                    max_retries=settings.loaders_max_retries,
-                    extract_on_the_fly=True,
-                ),
+                name=self.name,
             )
-        except Exception as e:
-            if "trust_remote_code" in str(e):
-                raise UnitxtUnverifiedCodeError(self.path) from e
+        except Exception:
             UnitxtWarning(
                 f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
             )
@@ -350,11 +368,16 @@ class LoadHF(LazyLoader):
             NotImplementedError
         ):  # streaming is not supported for zipped files so we load without streaming
             dataset = self.load_dataset(split=None, streaming=False)
+
+        if dataset is None:
+            raise FileNotFoundError(f"Dataset path={self.path}, name={self.name} was not found.") from None
+
         return list(dataset.keys())

     def split_generator(self, split: str) -> Generator:
         if self.get_limit() is not None:
-            self.log_limited_loading()
+            if not self.is_in_cache(split):
+                self.log_limited_loading()
         try:
             dataset = self.load_dataset(split=split)
         except (
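Split discovery now funnels through the retryable hf_get_dataset_splits helper, which maps raw loader errors onto clearer ones. A hedged sketch of the calling pattern (the dataset path and name are illustrative). Note that FileNotFoundError is itself in the decorator's default retry_exceptions tuple, so a missing cache is retried before it propagates.

    from unitxt.loaders import hf_get_dataset_splits

    try:
        splits = hf_get_dataset_splits(path="glue", name="cola")
    except FileNotFoundError:
        # Raised when the underlying error says "Couldn't find cache",
        # e.g. running offline with no cached copy of the dataset.
        splits = None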
metrics.py CHANGED
@@ -29,7 +29,6 @@ import numpy
 import numpy as np
 import pandas as pd
 import requests
-from datasets import DownloadConfig
 from scipy.stats import bootstrap
 from scipy.stats._warnings_errors import DegenerateDataWarning

@@ -65,14 +64,14 @@ from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
 from .type_utils import Type, isoftype, parse_type_string, to_type_string
-from .utils import deep_copy, recursive_copy
+from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff

 logger = get_logger()
 settings = get_settings()

 warnings.filterwarnings("ignore", category=DegenerateDataWarning)

-
+@retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_evaluate_load(path: str, *args, **kwargs):
     if settings.hf_offline_metrics_path is not None:
         path = os.path.join(settings.hf_offline_metrics_path, path)
@@ -81,9 +80,6 @@ def hf_evaluate_load(path: str, *args, **kwargs):
         *args,
         **kwargs,
         experiment_id=str(uuid.uuid4()),
-        download_config=DownloadConfig(
-            max_retries=settings.loaders_max_retries,
-        ),
         verification_mode="no_checks",
         trust_remote_code=settings.allow_unverified_code,
         download_mode=(
@@ -127,6 +123,7 @@ def nan_max(x):
         warnings.simplefilter("ignore", category=RuntimeWarning)
         return np.nanmax(x)

+
 def nan_std(x):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
@@ -398,12 +395,14 @@ class Statistic:
         result = np.array([scores[m] for m in self.score_names])
         self._history.append(result)
         return result
+
     def mean(self, idx):
         return nan_mean([result[idx] for result in self._history])

     def std(self, idx):
         return nan_std([result[idx] for result in self._history])

+
 class ConfidenceIntervalMixin(Artifact):
     n_resamples: int = 1000
     confidence_level: float = 0.95
@@ -413,18 +412,16 @@ class ConfidenceIntervalMixin(Artifact):
     def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
         pass

-
     def bootstrap(self, data: List[Any], score_names: List[str]):
         if self.ci_score_names is not None:
             score_names = self.ci_score_names

-
         statistic = Statistic(data, score_names, self._sample_to_scores)
         with warnings.catch_warnings():
-            warnings.filterwarnings( # Ignore error the arises when all sample scores are identical
+            warnings.filterwarnings(  # Ignore error the arises when all sample scores are identical
                 "ignore",
                 message="invalid value encountered in divide",
-                category=RuntimeWarning
+                category=RuntimeWarning,
             )

             intervals = bootstrap(
@@ -438,14 +435,17 @@ class ConfidenceIntervalMixin(Artifact):
             method="BCa",
         ).confidence_interval

-
         result = {}
         for i, metric in enumerate(score_names):
             high = intervals.high[i]
             low = intervals.low[i]
             if np.isnan(high) and np.isnan(low):
-                if statistic.std(i) == 0: # When sample scores are identical "BCa" will fail (due to division by std 0)
-                    high = low = statistic.mean(i) # In this case we will use the mean (as there is no variance)
+                if (
+                    statistic.std(i) == 0
+                ):  # When sample scores are identical "BCa" will fail (due to division by std 0)
+                    high = low = statistic.mean(
+                        i
+                    )  # In this case we will use the mean (as there is no variance)
             result[f"{metric}_ci_low"] = float(low)
             result[f"{metric}_ci_high"] = float(high)

@@ -2807,7 +2807,7 @@ class FinQAEval(InstanceMetric):
         remote_url = "https://raw.githubusercontent.com/czyssrs/FinQA/dfc5b72c01ee17c442d28d5201b82a1f4e95d5af/code/evaluate/evaluate.py"
         local_filepath = "/tmp/finqa_eval_script.py"
         module_name = "finqa_eval"
-        hash_of_script = "42430b8613082bb4b85d49210284135d" # pragma: allowlist secret
+        hash_of_script = "42430b8613082bb4b85d49210284135d"  # pragma: allowlist secret

         download_finqa_eval_script_file(remote_url, local_filepath, hash_of_script)
         self.finqa_module = load_finqa_eval_module_from_file(
@@ -3415,10 +3415,11 @@ class CustomF1(GlobalMetric):

 class KeyValueExtraction(GlobalMetric):

-    prediction_type = Dict[str,str]
-    metric : Metric
+    prediction_type = Dict[str, str]
+    metric: Metric
     single_reference_per_prediction = True
     main_score = ""
+
     def prepare(self):
         super().prepare()
         self.main_score = f"{self.metric.main_score}_micro"
@@ -3436,18 +3437,25 @@ class KeyValueExtraction(GlobalMetric):
         for reference in references:
             all_reference_keys.update(list(reference.keys()))
         for key in all_reference_keys:
-            key_statistics[key]= []
+            key_statistics[key] = []

-        num_prediction_keys=0
-        illegal_prediction_keys=0
+        num_prediction_keys = 0
+        illegal_prediction_keys = 0
         for reference, prediction in zip(references, predictions):
             for key in all_reference_keys:
-                if (key not in reference and key not in prediction):
+                if key not in reference and key not in prediction:
                     continue
-                if (key in reference and key in prediction):
-                    multi_stream = MultiStream.from_iterables({"test": [{"prediction" : prediction[key],
-                                                                         "references" : [reference[key]]}
-                                                                        ]})
+                if key in reference and key in prediction:
+                    multi_stream = MultiStream.from_iterables(
+                        {
+                            "test": [
+                                {
+                                    "prediction": prediction[key],
+                                    "references": [reference[key]],
+                                }
+                            ]
+                        }
+                    )
                     output_multi_stream = self.metric(multi_stream)
                     output_stream = output_multi_stream["test"]
                     score = next(iter(output_stream))["score"]["global"]["score"]
@@ -3460,7 +3468,7 @@ class KeyValueExtraction(GlobalMetric):
             if key not in all_reference_keys:
                 illegal_prediction_keys += 1

-        result={}
+        result = {}

         average = 0
         total = 0
@@ -3476,13 +3484,16 @@ class KeyValueExtraction(GlobalMetric):

         result[f"{self.metric.main_score}_micro"] = weighted_average / total
         result[f"{self.metric.main_score}_macro"] = average / len(key_statistics)
-        if (num_prediction_keys !=0):
-            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+        if num_prediction_keys != 0:
+            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = (
+                1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+            )
         else:
             result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 0

         return result

+
 class NER(CustomF1):
     """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""

@@ -3713,6 +3724,7 @@ class Detector(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         import torch
@@ -3753,6 +3765,7 @@ class RegardMetric(GlobalMetric):

     _requirements_list: List[str] = ["transformers", "torch", "tqdm"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoModelForSequenceClassification, AutoTokenizer
@@ -3942,6 +3955,7 @@ class SafetyMetric(MapReduceMetric[str, Tuple[float, str]], TorchDeviceMixin):

         return result

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import pipeline
@@ -4121,6 +4135,7 @@ class Perplexity(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def compute(
         self,
         references: List[List[Any]],
@@ -4394,6 +4409,7 @@ class FaithfulnessHHEM(BulkInstanceMetric):

     _requirements_list: List[str] = ["transformers", "torch"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         import torch
@@ -6051,6 +6067,7 @@ class GraniteGuardianBase(InstanceMetric):

     _requirements_list: List[str] = ["torch", "transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         from transformers import AutoTokenizer

@@ -6116,9 +6133,18 @@ class GraniteGuardianBase(InstanceMetric):
         )
         messages = self.process_input_fields(task_data)
         prompt = self.get_prompt(messages)
-        data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")
+        data_classification_policy = task_data.get("metadata", {}).get(
+            "data_classification_policy"
+        )

-        result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])
+        result = self.inference_engine.infer_log_probs(
+            [
+                {
+                    "source": prompt,
+                    "data_classification_policy": data_classification_policy,
+                }
+            ]
+        )

         generated_tokens_list = result[0]
         label, prob_of_risk = self.parse_output(generated_tokens_list)
@@ -6371,13 +6397,20 @@ class SQLExecutionAccuracy(InstanceMetric):
         df1.fillna(0, inplace=True)
         df2.fillna(0, inplace=True)

+        # Compare row counts first for a quick check
         if df1.shape != df2.shape:
             return False

-        df1_rows_sorted = [sorted(map(str, row)) for row in df1.to_numpy()]
-        df2_rows_sorted = [sorted(map(str, row)) for row in df2.to_numpy()]
+        # Convert DataFrames to numpy arrays of strings to handle mixed types
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        # Sort each row's elements (column order independence)
+        df1_sorted_rows = np.array([np.sort(row) for row in df1_array])
+        df2_sorted_rows = np.array([np.sort(row) for row in df2_array])

-        return df1_rows_sorted == df2_rows_sorted
+        # Compare the sorted rows in order
+        return np.array_equal(df1_sorted_rows, df2_sorted_rows)

     @staticmethod
     def compare_dfs_ignore_colnames_unordered_rows(df1, df2):
@@ -6391,46 +6424,85 @@ class SQLExecutionAccuracy(InstanceMetric):
         True if the DataFrames have the same content (ignoring column names and row order),
         False otherwise.
         """
-        return set(map(tuple, df1.to_numpy())) == set(map(tuple, df2.to_numpy()))
+        # Compare shapes early on
+        if df1.shape != df2.shape:
+            return False
+
+        # Convert DataFrames to numpy arrays of strings (to handle mixed data types)
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        # Sort columns first, then sort rows
+        df1_sorted = np.sort(np.sort(df1_array, axis=1), axis=0)
+        df2_sorted = np.sort(np.sort(df2_array, axis=1), axis=0)
+
+        # Compare the sorted arrays
+        return np.array_equal(df1_sorted, df2_sorted)

     @staticmethod
-    def is_subset_ignore_colnames(df1, df2):
-        """Checks if df1 is a subset of df2 based on row content, ignoring column names.
+    def compare_dfs_ignore_colnames_subset(df1, df2, ignore_row_order=True):
+        """Checks if the values of either DataFrame are a subset of the values in the other DataFrame.
+
+        Comparison is column order independent, and could optionally be row order independent.
+        We interpret "subset" as follows:
+        - For each row in df1, there must be a matching (or superset) row in df2, i.e. the set of values
+          in the df1 row is a subset of the set of values in that df2 row. Then do the same check in reverse.
+        - If either condition (df1 is subset of df2 OR df2 is subset of df1) is satisfied, return True.
+
+        We treat an empty dataframe as a subset of nothing, while in theory is a subset of any dataframe.

         Args:
-        df1: Pandas DataFrame 1 to compare.
-        df2: Pandas DataFrame 2 to compare.
+        df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
+        df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
+        ignore_row_order (bool): If True, row order doesn't matter; if False, row order is respected.

         Returns:
-        True if df1 is a subset of df2 based on column values,
-        False otherwise.
+        bool: True if df1 is a subset of df2 or vice versa, based on the specified row-order condition.
         """
-        if df1.empty or df2.empty or df1.shape[1] > df2.shape[1]:
-            return False
-
-        def make_hashable(value):
-            if isinstance(value, dict):
-                return json.dumps(value, sort_keys=True)
-            if isinstance(value, list):
-                return tuple(value)
-            return value
-
-        df1_cols = [
-            tuple(make_hashable(value) for value in df1.iloc[:, i])
-            for i in range(df1.shape[1])
-        ]
-        df2_cols = [
-            tuple(make_hashable(value) for value in df2.iloc[:, j])
-            for j in range(df2.shape[1])
-        ]
-        df2_cols_count = Counter(df2_cols)
-        for col in df1_cols:
-            if df2_cols_count[col] > 0:
-                df2_cols_count[col] -= 1
-            else:
-                return False
+        df1_array = df1.values.astype(str)
+        df2_array = df2.values.astype(str)
+
+        df1_sorted_rows = [np.sort(row) for row in df1_array]
+        df2_sorted_rows = [np.sort(row) for row in df2_array]
+
+        def row_is_subset(r_small, r_big):
+            """Check if all elements of r_small are in r_big."""
+            return set(r_small).issubset(set(r_big))
+
+        def df_is_subset_of_another(rows_small, rows_big, respect_order):
+            """Check if the rows_small is subset of rows_big under the given order condition."""
+            if not rows_small:
+                return False  # DataFrame needs to be non-empty
+
+            # If row order matters:
+            if respect_order:
+                i, j = 0, 0
+                while i < len(rows_small) and j < len(rows_big):
+                    if row_is_subset(rows_small[i], rows_big[j]):
+                        i += 1
+                    j += 1
+                return i == len(rows_small)
+            # Row order doesn't matter:
+            matched_indices = set()
+            for r_small in rows_small:
+                found_match = False
+                for idx, r_big in enumerate(rows_big):
+                    if idx not in matched_indices and row_is_subset(r_small, r_big):
+                        found_match = True
+                        matched_indices.add(idx)
+                        break
+                if not found_match:
+                    return False
+            return True
+
+        df1_sub_df2 = df_is_subset_of_another(
+            df1_sorted_rows, df2_sorted_rows, not ignore_row_order
+        )
+        df2_sub_df1 = df_is_subset_of_another(
+            df2_sorted_rows, df1_sorted_rows, not ignore_row_order
+        )

-        return True
+        return df1_sub_df2 or df2_sub_df1

     def get_sql_execution_results(
         self, predicted_sql: str, gold_sql: str, connector
@@ -6446,7 +6518,7 @@ class SQLExecutionAccuracy(InstanceMetric):
         a 12-tuple of
         1. execution_result: if df responses match
         2. non_empty_execution_result: if dfs are non-empty and match
-        3. subset_non_empty_execution_result: if non-empty dfs and gt df subset of predicted df
+        3. subset_non_empty_execution_result: if non-empty dfs and one is a subset of the other
         4. non_empty_gold_df: if gt df is non-empty
         5. gold_sql_runtime: ground truth query runtime
         6. predicted_sql_runtime: predicted query runtime
@@ -6569,12 +6641,21 @@ class SQLExecutionAccuracy(InstanceMetric):
         pred_res = pred_res["results"]
         predicted_df = pd.DataFrame(pred_res)

+        subset_non_empty_execution_result = 0
+        non_empty_execution_result = 0
         if "ORDER BY" in gold_sql.upper():
             execution_result = (
                 1
                 if self.compare_dfs_ignore_colnames_ordered_rows(predicted_df, gold_df)
                 else 0
             )
+            if non_empty_gold_df:
+                if execution_result == 1:
+                    non_empty_execution_result = 1
+                if self.compare_dfs_ignore_colnames_subset(
+                    gold_df, predicted_df, ignore_row_order=False
+                ):
+                    subset_non_empty_execution_result = 1
         else:
             execution_result = (
                 1
@@ -6583,14 +6664,13 @@ class SQLExecutionAccuracy(InstanceMetric):
                 )
                 else 0
             )
-
-        subset_non_empty_execution_result = 0
-        non_empty_execution_result = 0
-        if non_empty_gold_df:
-            if execution_result == 1:
-                non_empty_execution_result = 1
-            if self.is_subset_ignore_colnames(gold_df, predicted_df):
-                subset_non_empty_execution_result = 1
+            if non_empty_gold_df:
+                if execution_result == 1:
+                    non_empty_execution_result = 1
+                if self.compare_dfs_ignore_colnames_subset(
+                    gold_df, predicted_df, ignore_row_order=True
+                ):
+                    subset_non_empty_execution_result = 1

         return (
             execution_result,
@@ -6672,6 +6752,7 @@ class SQLNonExecutionAccuracy(InstanceMetric):
             "sqlglot_optimized_equivalence",
             "sqlparse_equivalence",
             "sql_exact_match",
+            "sql_syntactic_equivalence",
         ]
     }
     main_score = "sqlglot_equivalence"
@@ -6682,6 +6763,7 @@ class SQLNonExecutionAccuracy(InstanceMetric):
         "sqlglot_optimized_equivalence",
         "sqlparse_equivalence",
         "sql_exact_match",
+        "sql_syntactic_equivalence",
     ]

     prediction_type = "Any"  # string representation is compared
@@ -6729,6 +6811,17 @@ class SQLNonExecutionAccuracy(InstanceMetric):
             ),
             "sql_exact_match": float(sql_exact_match(predicted_sql, gold_sql)),
         }
+        result["sql_syntactic_equivalence"] = float(
+            any(
+                result[key]
+                for key in [
+                    "sqlglot_equivalence",
+                    "sqlglot_optimized_equivalence",
+                    "sqlparse_equivalence",
+                    "sql_exact_match",
+                ]
+            )
+        )
         logger.debug(f"SQL Non Execution Accuracy Result: {result}")
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
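To make the new subset semantics concrete: comparison is cell-value based after casting to strings, column order is ignored within each row, and with ignore_row_order=True every gold row just needs some distinct predicted row whose values contain it. A small illustrative check, assuming the class is importable as shown:

    import pandas as pd

    from unitxt.metrics import SQLExecutionAccuracy

    gold = pd.DataFrame([[1, "a"], [2, "b"]])
    pred = pd.DataFrame([["a", 1, "x"], ["b", 2, "y"], ["c", 3, "z"]])

    # Each gold row's values appear within some distinct predicted row,
    # so the gold result counts as a subset of the predicted result.
    assert SQLExecutionAccuracy.compare_dfs_ignore_colnames_subset(
        gold, pred, ignore_row_order=True
    )

The new sql_syntactic_equivalence score, by contrast, is simply an OR over the four existing syntactic scores, so it is 1.0 whenever any one of them fires.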
processors.py CHANGED
@@ -292,6 +292,13 @@ class ExtractMtBenchRatingJudgment(FieldOperator):
         except:
             return 0.0

+class ExtractHarmRatingJudgement(FieldOperator):
+    def process_value(self, text: Any) -> Any:
+        match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
+        try:
+            return float(match.group(1))*0.25 - 0.25
+        except:
+            return np.NaN

 class ExtractMtBenchLabelJudgment(FieldOperator):
     def process_value(self, text: Any) -> Any:
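The new ExtractHarmRatingJudgement maps a bracketed judge rating onto the unit interval: float(r)*0.25 - 0.25 sends ratings 1 through 5 to 0.0 through 1.0 in steps of 0.25. A quick standalone check of the endpoints, reimplementing the operator's core for illustration:

    import re

    import numpy as np

    def harm_score(text: str) -> float:
        match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
        try:
            return float(match.group(1)) * 0.25 - 0.25
        except (AttributeError, ValueError):  # no match or unparsable number
            return np.nan

    assert harm_score("verdict: [[1]]") == 0.0  # minimum rating
    assert harm_score("verdict: [[3]]") == 0.5  # midpoint
    assert harm_score("verdict: [[5]]") == 1.0  # maximum rating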
settings_utils.py CHANGED
@@ -160,6 +160,7 @@ if Settings.is_uninitilized():
     settings.hf_offline_metrics_path = None
     settings.hf_offline_models_path = None
     settings.inference_engine_cache_path = "./inference_engine_cache/"
+    settings.max_connection_retries = 3

 if Constants.is_uninitilized():
     constants = Constants()
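This is the knob the retry decorator in utils.py reads when no explicit max_retries is passed. Adjusting it in code is straightforward; whether it can also be set through a UNITXT_MAX_CONNECTION_RETRIES environment variable is an assumption based on how the other settings in this file behave, not something this diff shows.

    from unitxt.settings_utils import get_settings

    settings = get_settings()
    settings.max_connection_retries = 5  # default is 3 as of this commit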
sql_utils.py CHANGED
@@ -275,8 +275,15 @@ class Cache:

         logger.info(f"Cache miss for key: {key}. Computing value...")
         result = compute_fn()
-        self.cache[key] = result
-        logger.info(f"Stored result in cache for key: {key}")
+
+        if result and not (
+            isinstance(result, tuple) and len(result) == 2 and result[0] is None
+        ):
+            self.cache[key] = result
+            logger.info(f"Stored result in cache for key: {key}")
+        else:
+            logger.info(f"None result. Bypassing caching for key: {key}")
+
         return result

     async def async_get_or_set(self, key, compute_fn, no_cache=False, refresh=False):
@@ -494,7 +501,7 @@ class RemoteDatabaseConnector(DatabaseConnector):

         schema_text = ""
         for table in schema["tables"]:
-            schema_text += f"Table: {table['table_name']} has columns: {[col['column_name'] for col in table['columns']]}\n"
+            schema_text += f"Table: {table['name'] if 'name' in table else table['table_name']} has columns: {[col['name'] if 'name' in col else col['column_name'] for col in table['columns']]}\n"

         return schema_text
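The guard keeps failed lookups out of the cache: falsy results, and 2-tuples whose first element is None (a "no result plus error" shape), are returned but not stored, so the next call recomputes rather than replaying the failure. A compact restatement of the condition:

    def should_cache(result) -> bool:
        # Mirrors the new guard in the Cache get-or-set path.
        return bool(result) and not (
            isinstance(result, tuple) and len(result) == 2 and result[0] is None
        )

    assert should_cache(("rows", 0.12)) is True            # real result: cached
    assert should_cache((None, "timeout error")) is False  # error tuple: skipped
    assert should_cache(None) is False                     # empty result: skipped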
 
standard.py CHANGED
@@ -503,7 +503,7 @@ class DatasetRecipe(SourceSequentialOperator):
             loader = self.card.loader
             if self.loader_limit:
                 loader.loader_limit = self.loader_limit
-                logger.info(f"Loader line limit was set to {self.loader_limit}")
+                # logger.info(f"Loader line limit was set to {self.loader_limit}")
             self.loading.steps.append(loader)

         # This is required in case loader_limit is not enforced by the loader
string_operators.py CHANGED
@@ -9,6 +9,7 @@ from typing import (

 from .operators import FieldOperator, InstanceOperator
 from .settings_utils import get_settings
+from .utils import retry_connection_with_exponential_backoff

 settings = get_settings()

@@ -50,6 +51,7 @@ class TokensSlice(FieldOperator):

     _requirements_list = ["transformers"]

+    @retry_connection_with_exponential_backoff(backoff_factor=2)
     def prepare(self):
         super().prepare()
         from transformers import AutoTokenizer
utils.py CHANGED
@@ -1,15 +1,92 @@
 import copy
+import functools
 import importlib.util
 import json
+import logging
 import os
+import random
 import re
 import threading
+import time
 from collections import OrderedDict
 from functools import lru_cache
 from typing import Any, Dict
+from urllib.error import HTTPError as UrllibHTTPError

+from requests.exceptions import ConnectionError, HTTPError
+from requests.exceptions import Timeout as TimeoutError
+
+from .settings_utils import get_settings
 from .text_utils import is_made_of_sub_strings

+settings = get_settings()
+
+def retry_connection_with_exponential_backoff(max_retries=None,
+                                              retry_exceptions=(ConnectionError, TimeoutError, HTTPError, FileNotFoundError, UrllibHTTPError),
+                                              backoff_factor=1):
+    """Decorator that implements retry with exponential backoff for network operations.
+
+    Also handles errors that were triggered by the specified retry exceptions,
+    whether they're direct causes or part of the exception context.
+
+    Args:
+        max_retries: Maximum number of retry attempts (falls back to settings if None)
+        retry_exceptions: Tuple of exceptions that should trigger a retry
+        backoff_factor: Base delay factor in seconds for backoff calculation
+
+    Returns:
+        The decorated function with retry logic
+    """
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get max_retries from settings if not provided
+            retries = max_retries if max_retries is not None else settings.max_connection_retries
+
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    # Check if this exception or any of its causes match the retry exceptions
+                    should_retry = False
+                    current_exc = e
+
+                    # Check the exception chain for both __cause__ (explicit) and __context__ (implicit)
+                    visited_exceptions = set()  # To prevent infinite loops in rare cyclic exception references
+
+                    while current_exc is not None and id(current_exc) not in visited_exceptions:
+                        visited_exceptions.add(id(current_exc))
+
+                        if isinstance(current_exc, retry_exceptions):
+                            should_retry = True
+                            break
+
+                        # First check __cause__ (from "raise X from Y")
+                        if current_exc.__cause__ is not None:
+                            current_exc = current_exc.__cause__
+                        # Then check __context__ (from "try: ... except: raise X")
+                        elif current_exc.__context__ is not None:
+                            current_exc = current_exc.__context__
+                        else:
+                            # No more causes in the chain
+                            break
+
+                    if not should_retry:
+                        # Not a retry exception or caused by a retry exception, so re-raise
+                        raise
+
+                    if attempt >= retries - 1:  # Last attempt
+                        raise  # Re-raise the last exception
+
+                    # Calculate exponential backoff with jitter
+                    wait_time = backoff_factor * (2 ** attempt) + random.uniform(0, 1)
+                    logging.warning(f"{func.__name__} failed (attempt {attempt+1}/{retries}). "
+                                    f"Retrying in {wait_time:.2f}s. Error: {e!s}")
+                    time.sleep(wait_time)
+
+            raise ValueError("there was a problem") from None
+        return wrapper
+    return decorator

 class Singleton(type):
     _instances = {}
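Putting the new decorator together: with the default settings.max_connection_retries = 3 and backoff_factor=2, a failing call is attempted three times, sleeping backoff_factor * 2**attempt seconds plus up to one second of jitter between attempts (roughly 2-3s, then 4-5s) before the exception finally propagates. Because the wrapper walks __cause__ and __context__, wrapped network errors also trigger retries, as this small self-contained demonstration shows (it sleeps a few seconds while retrying):

    from requests.exceptions import ConnectionError

    from unitxt.utils import retry_connection_with_exponential_backoff

    attempts = {"n": 0}

    @retry_connection_with_exponential_backoff(max_retries=3, backoff_factor=2)
    def flaky():
        attempts["n"] += 1
        if attempts["n"] < 3:
            # The ConnectionError is only the __cause__ of the RuntimeError,
            # but the decorator inspects the whole exception chain.
            raise RuntimeError("wrapper error") from ConnectionError("network down")
        return "ok"

    assert flaky() == "ok" and attempts["n"] == 3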
version.py CHANGED
@@ -1 +1 @@
-version = "1.21.0"
+version = "1.22.0"