# dataset.py
from typing import Optional, Union

import datasets
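
# Importing __file__ from each sibling module (and discarding the value as
# "_") marks that module as a local dependency, so the Hugging Face `datasets`
# loader downloads all of these files together with this script.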
from .api import __file__ as _
from .artifact import __file__ as _
from .augmentors import __file__ as _
from .benchmark import __file__ as _
from .blocks import __file__ as _
from .card import __file__ as _
from .catalog import __file__ as _
from .collections import __file__ as _
from .collections_operators import __file__ as _
from .dataclass import __file__ as _
from .dataset_utils import __file__ as _
from .dataset_utils import get_dataset_artifact
from .deprecation_utils import __file__ as _
from .dialog_operators import __file__ as _
from .dict_utils import __file__ as _
from .error_utils import __file__ as _
from .eval_utils import __file__ as _
from .file_utils import __file__ as _
from .formats import __file__ as _
from .fusion import __file__ as _
from .generator_utils import __file__ as _
from .hf_utils import __file__ as _
from .hf_utils import verify_versions_compatibility
from .image_operators import __file__ as _
from .inference import __file__ as _
from .instructions import __file__ as _
from .llm_as_judge import __file__ as _
from .llm_as_judge_chat_templates import __file__ as _
from .llm_as_judge_constants import __file__ as _
from .llm_as_judge_from_template import __file__ as _
from .llm_as_judge_operators import __file__ as _
from .llm_as_judge_utils import __file__ as _
from .loaders import __file__ as _
from .logging_utils import __file__ as _
from .logging_utils import get_logger
from .metric import __file__ as _
from .metric_utils import __file__ as _
from .metrics import __file__ as _
from .normalizers import __file__ as _
from .operator import __file__ as _
from .operators import __file__ as _
from .parsing_utils import __file__ as _
from .processors import __file__ as _
from .random_utils import __file__ as _
from .recipe import __file__ as _
from .register import __file__ as _
from .schema import __file__ as _
from .schema import loads_instance
from .serializers import __file__ as _
from .settings_utils import __file__ as _
from .settings_utils import get_constants
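# Note: "span_lableing_operators" below mirrors the module's actual
# (misspelled) filename in the unitxt package; do not "fix" the spelling.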
from .span_lableing_operators import __file__ as _
from .split_utils import __file__ as _
from .splitters import __file__ as _
from .standard import __file__ as _
from .stream import __file__ as _
from .stream_operators import __file__ as _
from .string_operators import __file__ as _
from .struct_data_operators import __file__ as _
from .system_prompts import __file__ as _
from .task import __file__ as _
from .templates import __file__ as _
from .text_utils import __file__ as _
from .type_utils import __file__ as _
from .types import __file__ as _
from .utils import __file__ as _
from .utils import is_package_installed
from .validate import __file__ as _
from .version import __file__ as _

logger = get_logger()
constants = get_constants()


class Dataset(datasets.GeneratorBasedBuilder):
    VERSION = constants.version

    @property
    def generators(self):
        if not hasattr(self, "_generators") or self._generators is None:
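            # Prefer the locally installed unitxt package when available;
            # otherwise fall back to the unitxt copy bundled with this repo.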
            if is_package_installed("unitxt"):
                verify_versions_compatibility("dataset", self.VERSION)
                from unitxt.dataset_utils import (
                    get_dataset_artifact as get_dataset_artifact_installed,
                )

                logger.info("Loading with installed unitxt library...")
                dataset = get_dataset_artifact_installed(self.config.name)
            else:
                logger.info("Loading with huggingface unitxt copy...")
                dataset = get_dataset_artifact(self.config.name)
            self._generators = dataset()
        return self._generators

    def _info(self):
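        # No static schema: features are determined at runtime, and instances
        # are decoded by the loads_instance transform (see as_dataset below).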
        return datasets.DatasetInfo()

    def _split_generators(self, _):
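        # One SplitGenerator per stream produced by the unitxt recipe; the
        # download manager argument is unused and therefore named "_".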
        return [
            datasets.SplitGenerator(name=name, gen_kwargs={"split_name": name})
            for name in self.generators.keys()
        ]

    def _generate_examples(self, split_name):
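        # `datasets` expects (key, example) pairs; enumerate supplies a
        # running integer key for each generated instance.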
        generator = self.generators[split_name]
        yield from enumerate(generator)

    def _download_and_prepare(
        self, dl_manager, verification_mode, **prepare_splits_kwargs
    ):
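        # Verification is forced to "no_checks": unitxt streams are generated
        # on the fly, so there are no precomputed checksums or split sizes.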
        return super()._download_and_prepare(
            dl_manager, "no_checks", **prepare_splits_kwargs
        )

    def as_dataset(
        self,
        split: Optional[datasets.Split] = None,
        run_post_process=True,
        verification_mode: Optional[Union[datasets.VerificationMode, str]] = None,
        in_memory=False,
    ) -> Union[datasets.Dataset, datasets.DatasetDict]:
"""Return a Dataset for the specified split.
Args:
split (`datasets.Split`):
Which subset of the data to return.
run_post_process (`bool`, defaults to `True`):
Whether to run post-processing dataset transforms and/or add
indexes.
verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
Verification mode determining the checks to run on the
downloaded/processed dataset information (checksums/size/splits/...).
in_memory (`bool`, defaults to `False`):
Whether to copy the data in-memory.
Returns:
datasets.Dataset
:Example:
.. code-block:: python
from datasets import load_dataset_builder
builder = load_dataset_builder('rotten_tomatoes')
builder.download_and_prepare()
ds = builder.as_dataset(split='train')
print(ds)
# prints:
# Dataset({
# features: ['text', 'label'],
# num_rows: 8530
# })
"""
        return (
            super()
            .as_dataset(split, run_post_process, verification_mode, in_memory)
            .with_transform(loads_instance)
        )
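

# Example usage (a minimal sketch; the repository id and recipe string are
# illustrative assumptions, substitute any valid unitxt recipe):
#
#     from datasets import load_dataset
#
#     ds = load_dataset(
#         "unitxt/data",
#         "card=cards.wnli,template=templates.classification.multi_class.relation.default",
#         trust_remote_code=True,
#     )
#     print(ds["train"][0])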