Elron committed on
Commit 365fb61 · verified · 1 Parent(s): 91ef70a

Upload folder using huggingface_hub

Files changed (7)
  1. api.py +52 -27
  2. formats.py +50 -0
  3. loaders.py +14 -74
  4. metrics.py +0 -1
  5. operators.py +205 -1
  6. settings_utils.py +4 -2
  7. version.py +1 -1
api.py CHANGED
@@ -1,10 +1,13 @@
+ import hashlib
  import inspect
  import json
+ import tempfile
  from datetime import datetime
  from functools import lru_cache
  from typing import Any, Dict, List, Optional, Union
  
  from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+ from datasets.exceptions import DatasetGenerationError
  
  from .artifact import fetch_artifact
  from .card import TaskCard
@@ -19,7 +22,7 @@ from .loaders import LoadFromDictionary
  from .logging_utils import get_logger
  from .metric_utils import EvaluationResults, _compute, _inference_post_process
  from .operator import SourceOperator
- from .schema import UNITXT_DATASET_SCHEMA, loads_instance
+ from .schema import loads_instance
  from .settings_utils import get_constants, get_settings
  from .standard import DatasetRecipe
  from .task import Task
@@ -29,13 +32,9 @@ constants = get_constants()
  settings = get_settings()
  
  
- def load(source: Union[SourceOperator, str]):
-     assert isinstance(
-         source, (SourceOperator, str)
-     ), "source must be a SourceOperator or a string"
-     if isinstance(source, str):
-         source, _ = fetch_artifact(source)
-     return source().to_dataset()
+ def short_hex_hash(value, length=8):
+     h = hashlib.sha256(value.encode()).hexdigest()  # Full 64-character hex
+     return h[:length]
  
  
  def _get_recipe_from_query(dataset_query: str) -> DatasetRecipe:
@@ -135,11 +134,44 @@ def create_dataset(
      return load_dataset(card=card, split=split, **kwargs)
  
  
+ def _source_to_dataset(
+     source: SourceOperator, split=None, use_cache=False, streaming=False
+ ):
+     from .dataset import Dataset as UnitxtDataset
+ 
+     stream = source()
+ 
+     with tempfile.TemporaryDirectory() as dir_to_be_deleted:
+         cache_dir = dir_to_be_deleted if not use_cache else None
+         ds_builder = UnitxtDataset(
+             dataset_name="unitxt",
+             config_name="recipe-" + short_hex_hash(source.to_json()),
+             hash=hash(source.to_json()),
+             version=constants.version,
+             cache_dir=cache_dir,
+         )
+         if split is not None:
+             stream = {split: stream[split]}
+         ds_builder._generators = stream
+ 
+         try:
+             ds_builder.download_and_prepare()
+ 
+             if streaming:
+                 return ds_builder.as_streaming_dataset(split=split)
+ 
+             return ds_builder.as_dataset(
+                 split=split, run_post_process=False, verification_mode="no_checks"
+             )
+         except DatasetGenerationError as e:
+             raise e.__cause__
+ 
+ 
  def load_dataset(
      dataset_query: Optional[str] = None,
      split: Optional[str] = None,
      streaming: bool = False,
-     disable_cache: Optional[bool] = None,
+     use_cache: Optional[bool] = False,
      **kwargs,
  ) -> Union[DatasetDict, IterableDatasetDict, Dataset, IterableDataset]:
      """Loads dataset.
@@ -156,11 +188,16 @@ def load_dataset(
              local catalog or name of specific recipe or benchmark in the catalog. For
              example, ``"card=cards.wnli,template=templates.classification.multi_class.relation.default"``.
          streaming (bool, False):
-             When True yields the data as Unitxt streams dictionary
+             When True yields the data as a stream.
+             This is useful when loading very large datasets.
+             Loading datasets as streams avoids loading all the data into memory, but requires the dataset's loader to support streaming.
          split (str, optional):
              The split of the data to load
-         disable_cache (str, optional):
-             Disable caching process of the data
+         use_cache (bool, optional):
+             If set to True, the returned Huggingface dataset is cached on local disk such that if the same dataset is loaded again, it will be loaded from local disk, resulting in faster runs.
+             If set to False (default), the returned dataset is not cached.
+             Note that if caching is enabled and the dataset card definition is changed, the old version in the cache may be returned.
+             Enable caching only if you are sure you are working with fixed Unitxt datasets and definitions (e.g. running using predefined datasets from the Unitxt catalog).
          **kwargs:
              Arguments used to load dataset from provided card, which is not present in local catalog.
  
@@ -184,21 +221,9 @@
      """
      recipe = load_recipe(dataset_query, **kwargs)
  
-     stream = recipe()
-     if split is not None:
-         stream = stream[split]
- 
-     if disable_cache is None:
-         disable_cache = settings.disable_hf_datasets_cache
- 
-     if streaming:
-         dataset = stream.to_iterable_dataset(
-             features=UNITXT_DATASET_SCHEMA,
-         ).map(loads_instance, batched=True)
-     else:
-         dataset = stream.to_dataset(
-             features=UNITXT_DATASET_SCHEMA, disable_cache=disable_cache
-         ).with_transform(loads_instance)
+     dataset = _source_to_dataset(
+         source=recipe, split=split, use_cache=use_cache, streaming=streaming
+     )
  
      frame = inspect.currentframe()
      args, _, _, values = inspect.getargvalues(frame)
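A minimal usage sketch for the reworked load_dataset above (assuming the catalog entries named in the docstring, cards.wnli and its multi_class relation template, are available locally); the use_cache and streaming flags are simply forwarded to _source_to_dataset:

    from unitxt.api import load_dataset

    query = "card=cards.wnli,template=templates.classification.multi_class.relation.default"

    # Default behaviour: the dataset is built in a temporary directory and
    # nothing is persisted in the Hugging Face cache (use_cache=False).
    train = load_dataset(query, split="train")

    # With use_cache=True the prepared dataset is kept on local disk, so a
    # second call with the same recipe is served from the cache.
    train_cached = load_dataset(query, split="train", use_cache=True)

    # With streaming=True an iterable dataset is returned via as_streaming_dataset().
    train_stream = load_dataset(query, split="train", streaming=True)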
formats.py CHANGED
@@ -13,6 +13,7 @@ from typing import (
  
  from .dataclass import OptionalField
  from .dict_utils import dict_get
+ from .error_utils import UnitxtError
  from .image_operators import image_to_data_url
  from .operator import InstanceOperator
  from .settings_utils import get_constants
@@ -25,6 +26,55 @@ class Format(InstanceOperator):
      pass
  
  
+ class GraniteDocumentsFormat(Format):
+     model: str = "ibm-granite/granite-3.1-8b-instruct"
+     citations: bool = True
+     length: str = "long"
+ 
+     _requirements_list = ["transformers"]
+ 
+     def prepare(self):
+         super().prepare()
+         from transformers import AutoTokenizer
+ 
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model)
+ 
+     def process(
+         self, instance: Dict[str, Any], stream_name: Optional[str] = None
+     ) -> Dict[str, Any]:
+         inputs = instance["input_fields"]
+         if "question" not in inputs:
+             raise UnitxtError(
+                 "GraniteDocumentsFormat works only for tasks with field: 'question'"
+             )
+         if "context" not in inputs and "contexts" not in inputs:
+             raise UnitxtError(
+                 "GraniteDocumentsFormat works only for tasks with field: 'context' or 'contexts'"
+             )
+ 
+         if "context" in inputs:
+             texts = [inputs["context"]]
+         if "contexts" in inputs:
+             texts = inputs["contexts"]
+ 
+         documents = []
+         for text in texts:
+             documents.append({"title": "", "text": text})
+ 
+         question = inputs["question"]
+ 
+         instance["source"] = self.tokenizer.apply_chat_template(
+             [
+                 {"role": "user", "content": question},
+             ],
+             documents=documents,
+             controls={"citations": self.citations, "length": self.length},
+             add_generation_prompt=True,
+             tokenize=False,
+         )
+         return instance
+ 
+ 
  def apply_capital_new_line_notation(text: str) -> str:
      r"""Transforms a given string by applying the Capital New Line Notation.
  
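A sketch of plugging GraniteDocumentsFormat into a RAG-style recipe; the card and template names below are illustrative placeholders, and the format needs the transformers package plus access to the ibm-granite/granite-3.1-8b-instruct tokenizer:

    from unitxt.api import load_dataset
    from unitxt.formats import GraniteDocumentsFormat

    # The format renders each instance with the Granite chat template,
    # passing the task's context(s) as `documents` and enabling citations.
    dataset = load_dataset(
        card="cards.rag.response_generation.clap_nq",  # placeholder card name
        template="templates.rag.response_generation.simple",  # placeholder template name
        format=GraniteDocumentsFormat(citations=True, length="long"),
        split="test",
    )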
loaders.py CHANGED
@@ -53,7 +53,7 @@ from typing import (
  
  import pandas as pd
  import requests
- from datasets import IterableDatasetDict
+ from datasets import DatasetDict, IterableDatasetDict
  from datasets import load_dataset as hf_load_dataset
  from huggingface_hub import HfApi
  from tqdm import tqdm
@@ -210,7 +210,7 @@ class LoadHF(Loader):
          Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
      ] = None
      revision: Optional[str] = None
-     streaming: bool = True
+     streaming: bool = None
      filtering_lambda: Optional[str] = None
      num_proc: Optional[int] = None
      requirements_list: List[str] = OptionalField(default_factory=list)
@@ -221,7 +221,7 @@
                  self._requirements_list.append(requirement)
          super().verify()
  
-     def filter_load(self, dataset):
+     def filter_load(self, dataset: DatasetDict):
          if not settings.allow_unverified_code:
              raise ValueError(
                  f"{self.__class__.__name__} cannot run use filtering_lambda expression without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE=True."
@@ -229,9 +229,14 @@
          logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
          return dataset.filter(eval(self.filtering_lambda))
  
+     def is_streaming(self) -> bool:
+         if self.streaming is None:
+             return settings.stream_hf_datasets_by_default
+         return self.streaming
+ 
      def stream_dataset(self):
          with tempfile.TemporaryDirectory() as dir_to_be_deleted:
-             if settings.disable_hf_datasets_cache and not self.streaming:
+             if settings.disable_hf_datasets_cache and not self.is_streaming():
                  cache_dir = dir_to_be_deleted
              else:
                  cache_dir = None
@@ -242,7 +247,7 @@
                      data_dir=self.data_dir,
                      data_files=self.data_files,
                      revision=self.revision,
-                     streaming=self.streaming,
+                     streaming=self.is_streaming(),
                      cache_dir=cache_dir,
                      split=self.split,
                      trust_remote_code=settings.allow_unverified_code,
@@ -288,11 +293,8 @@
                  f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
              ) from e
  
-         if self.split is None:
-             for split in dataset.keys():
-                 dataset[split] = dataset[split].to_iterable_dataset()
-         else:
-             dataset = {self.split: dataset.to_iterable_dataset()}
+         if self.split is not None:
+             dataset = {self.split: dataset}
  
          return dataset
  
@@ -824,6 +826,8 @@ class LoadFromHFSpace(LoadHF):
      token_env: Optional[str] = None
      requirements_list: List[str] = ["huggingface_hub"]
  
+     streaming: bool = True
+ 
      def _get_token(self) -> Optional[Union[bool, str]]:
          if self.token_env:
              token = os.getenv(self.token_env)
@@ -954,70 +958,6 @@
          self.path = self._download_data()
          return super().load_data()
  
- # url: str
- 
- # _requirements_list: List[str] = ["opendatasets"]
- # data_classification_policy = ["public"]
- 
- # def verify(self):
- #     super().verify()
- #     if not os.path.isfile("kaggle.json"):
- #         raise MissingKaggleCredentialsError(
- #             "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
- #         )
- 
- #     if self.streaming:
- #         raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
- 
- # def prepare(self):
- #     super().prepare()
- #     from opendatasets import download
- 
- #     self.downloader = download
- 
- # def load_iterables(self):
- #     with TemporaryDirectory() as temp_directory:
- #         self.downloader(self.url, temp_directory)
- #         return hf_load_dataset(temp_directory, streaming=False)
- 
- # class LoadFromAPI(Loader):
- #     """Loads data from from API"""
- 
- #     urls: Dict[str, str]
- #     chunksize: int = 100000
- #     loader_limit: Optional[int] = None
- #     streaming: bool = False
- 
- #     def _maybe_set_classification_policy(self):
- #         self.set_default_data_classification(["proprietary"], "when loading from API")
- 
- #     def load_iterables(self):
-         self.api_key = os.getenv("SQL_API_KEY", None)
-         if not self.api_key:
-             raise ValueError(
-                 "The environment variable 'SQL_API_KEY' must be set to use the RemoteDatabaseConnector."
-             )
- 
-         self.base_headers = {
-             "Content-Type": "application/json",
-             "accept": "application/json",
-             "Authorization": f"Bearer {self.api_key}",
-         }
- 
-         iterables = {}
-         for split_name, url in self.urls.items():
-             response = requests.get(
-                 url,
-                 headers=self.base_headers,
-                 verify=True,
-             )
- 
-             iterables[split_name] = pd.DataFrame(
-                 json.loads(response.text)["embeddings"]
-             )
- 
-         return iterables
- 
  
  class LoadFromAPI(Loader):
      """Loads data from from API.
metrics.py CHANGED
@@ -1886,7 +1886,6 @@ class RelaxedCorrectness(GlobalMetric):
              "relaxed_augmented_split": [],
          }
          for pred, ref, task_data_i in zip(predictions, references, task_data):
-             print(task_data_i)
              type = task_data_i["type"]
              score = self.relaxed_correctness(pred, ref[0])
              score = 1.0 if score else 0.0
operators.py CHANGED
@@ -67,6 +67,7 @@ from .artifact import Artifact, fetch_artifact
  from .dataclass import NonPositionalField, OptionalField
  from .deprecation_utils import deprecation
  from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
+ from .error_utils import UnitxtError
  from .generator_utils import ReusableGenerator
  from .operator import (
      InstanceOperator,
@@ -84,7 +85,7 @@ from .operator import (
  from .random_utils import new_random_generator
  from .settings_utils import get_settings
  from .stream import DynamicStream, Stream
- from .text_utils import nested_tuple_to_string
+ from .text_utils import nested_tuple_to_string, to_pretty_string
  from .type_utils import isoftype
  from .utils import (
      LRUCache,
@@ -1476,6 +1477,113 @@ class Intersect(FieldOperator):
          return [e for e in value if e in self.allowed_values]
  
  
+ class IntersectCorrespondingFields(InstanceOperator):
+     """Intersects the value of a field, which must be a list, with a given list, and removes the corresponding elements from other list fields.
+ 
+     For example:
+ 
+     Assume the instances contain a field of 'labels' and a field with the labels' corresponding 'positions' in the text.
+ 
+     IntersectCorrespondingFields(field="label",
+                                  allowed_values=["b", "f"],
+                                  corresponding_fields_to_intersect=["position"])
+ 
+     would keep only the "b" and "f" values in the 'labels' field and
+     their respective values in the 'position' field.
+     (All other fields are not affected)
+ 
+     Given this input:
+ 
+     [
+         {"label": ["a", "b"], "position": [0, 1], "other": "not"},
+         {"label": ["a", "c", "d"], "position": [0, 1, 2], "other": "relevant"},
+         {"label": ["a", "b", "f"], "position": [0, 1, 2], "other": "field"}
+     ]
+ 
+     the output would be:
+ 
+     [
+         {"label": ["b"], "position": [1], "other": "not"},
+         {"label": [], "position": [], "other": "relevant"},
+         {"label": ["b", "f"], "position": [1, 2], "other": "field"},
+     ]
+ 
+     Args:
+         field - the field to intersect (must contain list values)
+         allowed_values (list) - the list of values to keep
+         corresponding_fields_to_intersect (list) - additional list fields from which values
+             are removed based on the corresponding indices of the values removed from 'field'
+     """
+ 
+     field: str
+     allowed_values: List[str]
+     corresponding_fields_to_intersect: List[str]
+ 
+     def verify(self):
+         super().verify()
+ 
+         if not isinstance(self.allowed_values, list):
+             raise ValueError(
+                 f"The allowed_values is not a list but '{type(self.allowed_values)}'"
+             )
+ 
+     def process(
+         self, instance: Dict[str, Any], stream_name: Optional[str] = None
+     ) -> Dict[str, Any]:
+         if self.field not in instance:
+             raise ValueError(
+                 f"Field '{self.field}' is not in provided instance.\n"
+                 + to_pretty_string(instance)
+             )
+ 
+         for corresponding_field in self.corresponding_fields_to_intersect:
+             if corresponding_field not in instance:
+                 raise ValueError(
+                     f"Field '{corresponding_field}' is not in provided instance.\n"
+                     + to_pretty_string(instance)
+                 )
+ 
+         if not isinstance(instance[self.field], list):
+             raise ValueError(
+                 f"Value of field '{self.field}' is not a list, so IntersectCorrespondingFields can not intersect with allowed values. Field value:\n"
+                 + to_pretty_string(instance, keys=[self.field])
+             )
+ 
+         num_values_in_field = len(instance[self.field])
+ 
+         if set(self.allowed_values) == set(instance[self.field]):
+             return instance
+ 
+         indices_to_keep = [
+             i
+             for i, value in enumerate(instance[self.field])
+             if value in set(self.allowed_values)
+         ]
+ 
+         result_instance = {}
+         for field_name, field_value in instance.items():
+             if (
+                 field_name in self.corresponding_fields_to_intersect
+                 or field_name == self.field
+             ):
+                 if not isinstance(field_value, list):
+                     raise ValueError(
+                         f"Value of field '{field_name}' is not a list, IntersectCorrespondingFields can not intersect with allowed values."
+                     )
+                 if len(field_value) != num_values_in_field:
+                     raise ValueError(
+                         f"Number of elements in field '{field_name}' is not the same as the number of elements in field '{self.field}' so the IntersectCorrespondingFields can not remove corresponding values.\n"
+                         + to_pretty_string(instance, keys=[self.field, field_name])
+                     )
+                 result_instance[field_name] = [
+                     value
+                     for index, value in enumerate(field_value)
+                     if index in indices_to_keep
+                 ]
+             else:
+                 result_instance[field_name] = field_value
+         return result_instance
+ 
+ 
  class RemoveValues(FieldOperator):
      """Removes elements in a field, which must be a list, using a given list of unallowed.
  
@@ -2243,6 +2351,102 @@ class CollateInstances(StreamOperator):
          )
  
  
+ class CollateInstancesByField(StreamOperator):
+     """Groups a list of instances by a specified field, aggregates specified fields into lists, and ensures consistency for all other non-aggregated fields.
+ 
+     Args:
+         by_field (str): the name of the field to group data by.
+         aggregate_fields (list(str)): the field names to aggregate into lists.
+ 
+     Returns:
+         A stream of instances grouped and aggregated by the specified field.
+ 
+     Raises:
+         UnitxtError: If non-aggregate fields have inconsistent values.
+ 
+     Example:
+         Collate the instances based on field "category" and aggregate fields "value" and "id".
+ 
+         CollateInstancesByField(by_field="category", aggregate_fields=["value", "id"])
+ 
+         given input:
+         [
+             {"id": 1, "category": "A", "value": 10, "flag": True},
+             {"id": 2, "category": "B", "value": 20, "flag": False},
+             {"id": 3, "category": "A", "value": 30, "flag": True},
+             {"id": 4, "category": "B", "value": 40, "flag": False}
+         ]
+ 
+         the output is:
+         [
+             {"category": "A", "id": [1, 3], "value": [10, 30], "flag": True},
+             {"category": "B", "id": [2, 4], "value": [20, 40], "flag": False}
+         ]
+ 
+         Note that the "flag" field is not aggregated, and must be the same
+         in all instances in the same category, or an error is raised.
+     """
+ 
+     by_field: str = NonPositionalField(required=True)
+     aggregate_fields: List[str] = NonPositionalField(required=True)
+ 
+     def prepare(self):
+         super().prepare()
+ 
+     def verify(self):
+         super().verify()
+         if not isinstance(self.by_field, str):
+             raise UnitxtError(
+                 f"The 'by_field' value is not a string but '{type(self.by_field)}'"
+             )
+ 
+         if not isinstance(self.aggregate_fields, list):
+             raise UnitxtError(
+                 f"The 'aggregate_fields' value is not a list but '{type(self.aggregate_fields)}'"
+             )
+ 
+     def process(self, stream: Stream, stream_name: Optional[str] = None):
+         grouped_data = {}
+ 
+         for instance in stream:
+             if self.by_field not in instance:
+                 raise UnitxtError(
+                     f"The field '{self.by_field}' specified by CollateInstancesByField's 'by_field' argument is not found in instance."
+                 )
+             for k in self.aggregate_fields:
+                 if k not in instance:
+                     raise UnitxtError(
+                         f"The field '{k}' specified in CollateInstancesByField's 'aggregate_fields' argument is not found in instance."
+                     )
+             key = instance[self.by_field]
+ 
+             if key not in grouped_data:
+                 grouped_data[key] = {
+                     k: v for k, v in instance.items() if k not in self.aggregate_fields
+                 }
+                 # Add empty lists for fields to aggregate
+                 for agg_field in self.aggregate_fields:
+                     if agg_field in instance:
+                         grouped_data[key][agg_field] = []
+ 
+             for k, v in instance.items():
+                 # Merge classification policy list across instances with the same key
+                 if k == "data_classification_policy" and instance[k]:
+                     grouped_data[key][k] = sorted(set(grouped_data[key][k] + v))
+                 # Check consistency for all non-aggregate fields
+                 elif k != self.by_field and k not in self.aggregate_fields:
+                     if k in grouped_data[key] and grouped_data[key][k] != v:
+                         raise ValueError(
+                             f"Inconsistent value for field '{k}' in group '{key}': "
+                             f"'{grouped_data[key][k]}' vs '{v}'. Ensure that all non-aggregated fields in CollateInstancesByField are consistent across all instances."
+                         )
+                 # Aggregate fields
+                 elif k in self.aggregate_fields:
+                     grouped_data[key][k].append(instance[k])
+ 
+         yield from grouped_data.values()
+ 
+ 
  class WikipediaFetcher(FieldOperator):
      mode: Literal["summary", "text"] = "text"
      _requirements_list = ["Wikipedia-API"]
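Minimal sketches of the two new operators, applied here directly through their process methods on plain Python dicts (standalone use outside a full Unitxt recipe is an assumption made for illustration):

    from unitxt.operators import CollateInstancesByField, IntersectCorrespondingFields

    # Keep only labels "b" and "f" and drop the matching entries in "position".
    intersect = IntersectCorrespondingFields(
        field="label",
        allowed_values=["b", "f"],
        corresponding_fields_to_intersect=["position"],
    )
    print(intersect.process({"label": ["a", "b", "f"], "position": [0, 1, 2], "other": "field"}))
    # -> {"label": ["b", "f"], "position": [1, 2], "other": "field"}

    # Group instances by "category", collecting "id" and "value" into lists.
    collate = CollateInstancesByField(by_field="category", aggregate_fields=["id", "value"])
    instances = [
        {"id": 1, "category": "A", "value": 10, "flag": True},
        {"id": 3, "category": "A", "value": 30, "flag": True},
    ]
    print(list(collate.process(instances)))
    # -> [{"category": "A", "flag": True, "id": [1, 3], "value": [10, 30]}]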
settings_utils.py CHANGED
@@ -149,8 +149,10 @@ if Settings.is_uninitilized():
      settings.skip_artifacts_prepare_and_verify = (bool, False)
      settings.data_classification_policy = None
      settings.mock_inference_mode = (bool, False)
-     settings.disable_hf_datasets_cache = (bool, True)
-     settings.loader_cache_size = (int, 1)
+     settings.disable_hf_datasets_cache = (bool, False)
+     settings.stream_hf_datasets_by_default = (bool, False)
+ 
+     settings.loader_cache_size = (int, 10)
      settings.task_data_as_text = (bool, True)
      settings.default_provider = "watsonx"
      settings.default_format = None
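A sketch of how the new defaults interact with loading, read and overridden through the shared settings object (assuming no UNITXT_* environment variables override them); LoadHF now resolves its streaming flag from stream_hf_datasets_by_default when it is left unset:

    from unitxt.loaders import LoadHF
    from unitxt.settings_utils import get_settings

    settings = get_settings()

    # Defaults introduced by this commit (unless overridden by the environment).
    assert settings.disable_hf_datasets_cache is False
    assert settings.stream_hf_datasets_by_default is False
    assert settings.loader_cache_size == 10

    # A LoadHF loader with streaming left as None follows the global default,
    # while an explicit value always wins.
    assert LoadHF(path="glue", name="wnli").is_streaming() is False
    assert LoadHF(path="glue", name="wnli", streaming=True).is_streaming() is True

    # Re-enable the previous cache-less behaviour for the current process.
    settings.disable_hf_datasets_cache = True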
version.py CHANGED
@@ -1 +1 @@
- version = "1.17.1"
+ version = "1.17.2"