#!/usr/bin/env python
# coding=utf-8
"""This module defines the `Dataset` class, with methods for initializing,
loading, and manipulating datasets from different backends such as Hugging
Face and JSON. The `Dataset` class includes methods for loading a dataset
from a dictionary or from a Hugging Face dataset, mapping over a dataset,
and retrieving the backend dataset and its arguments.
"""

# Importing necessary libraries and modules
import json
from pathlib import Path
from typing import Optional

from datasets import load_dataset
from datasets import Dataset as HFDataset

from lmflow.args import DatasetArguments

DATASET_TYPES = [
    "text_only",
    "text2text",
]

KEY_TYPE = "type"
KEY_INSTANCES = "instances"


class Dataset:
    r"""
    Initializes the Dataset object with the given parameters.

    Parameters
    ------------
    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    backend : str, default="huggingface"
        A string representing the dataset backend. Defaults to "huggingface".

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """
    def __init__(self, data_args=None, backend: str="huggingface", *args, **kwargs):
        self.data_args = data_args
        self.backend = backend
        self.backend_dataset = None
        self.type = None        # Original type of the dataset
        # Guard against data_args=None so the default argument is usable
        self.dataset_path = data_args.dataset_path if data_args is not None else None

        if self.dataset_path is None:
            return

        if backend == "huggingface":
            data_files = [
                x.absolute().as_posix()
                for x in Path(self.dataset_path).glob("*.json")
            ]

            # Iterate through all the files and ensure they have the same data type
            for single_file in data_files:
                with open(single_file) as fin:
                    json_data = json.load(fin)
                    if KEY_TYPE not in json_data.keys():
                        raise ValueError(
                            f'"{KEY_TYPE}" field must be specified for data, e.g.'
                            '{\n'
                            f'    "{KEY_TYPE}": "text_only",\n'
                            f'    "{KEY_INSTANCES}": [\n'
                            '        { "text": "Sentence 1: This is a sentence." },\n'
                            '        { "text": "Sentence 2: This is another sentence." }\n'
                            f'    ]\n'
                            '}'
                        )

                    if self.type is None:
                        self.type = json_data[KEY_TYPE]
                    elif self.type != json_data[KEY_TYPE]:
                        raise ValueError(
                            'All task files must have the same data type. Previous'
                            f' files have type "{self.type}", but file'
                            f' {single_file} has type "{json_data[KEY_TYPE]}".'
                        )

            # Load the dataset using the HuggingFace dataset library
            extensions = "json"
            raw_dataset = load_dataset(
                extensions,
                data_files=data_files,
                field=KEY_INSTANCES,
                split="train",
                use_auth_token=None,
            )
            self.backend_dataset = raw_dataset
        elif backend == "json":
            # TODO (@Jiachun)
            pass
        else:
            raise NotImplementedError(f'Unsupported dataset backend "{backend}"')

    def _check_data_type(self):
        # TODO: check if data type and data structure matches, raise messages
        # with hints
        pass

    def from_dict(self, dict_obj: dict, *args, **kwargs):
        r"""
        Create a Dataset object from a dictionary.

        Return a Dataset given a dict with format:
            {
                "type": TYPE,
                "instances": [
                    {
                        "key_1": VALUE_1.1,
                        "key_2": VALUE_1.2,
                        ...
                    },
                    {
                        "key_1": VALUE_2.1,
                        "key_2": VALUE_2.2,
                        ...
                    },
                    ...
                ]
            }

        Parameters
        -----------

        dict_obj : dict.
            A dictionary containing the dataset information.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ---------

        self : Dataset object.
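
        Examples
        ---------
        An illustrative sketch of the expected payload (the two instances
        below are made-up examples, not required content):

        >>> data_args = DatasetArguments(dataset_path=None)
        >>> dataset = Dataset(data_args)
        >>> dataset = dataset.from_dict({
        ...     "type": "text_only",
        ...     "instances": [
        ...         {"text": "Sentence 1: This is a sentence."},
        ...         {"text": "Sentence 2: This is another sentence."},
        ...     ],
        ... })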
""" if self.backend == "huggingface": if KEY_TYPE not in dict_obj: raise ValueError( f'"{KEY_TYPE}" must be provided to initialize a dataset' ) if KEY_INSTANCES not in dict_obj: raise ValueError( f'"{KEY_INSTANCES}" must be provided to initialize a dataset' ) self.type = dict_obj[KEY_TYPE] hf_dict = {} if len(dict_obj[KEY_INSTANCES]) > 0: for key in dict_obj[KEY_INSTANCES][0].keys(): hf_dict[key] = [ instance[key] for instance in dict_obj[KEY_INSTANCES] ] self.backend_dataset = HFDataset.from_dict(hf_dict, *args, **kwargs) return self else: raise NotImplementedError( f'Currently .from_dict is not supported for backend "{backend}"' ) @classmethod def create_from_dict(cls, dict_obj, *args, **kwargs): r""" Returns -------- Returns a Dataset object given a dict. """ empty_data_args = DatasetArguments(dataset_path=None) dataset = Dataset(empty_data_args) return dataset.from_dict(dict_obj) def to_dict(self): r""" Returns --------- Return a dict represents the dataset: { "type": TYPE, "instances": [ { "key_1": VALUE_1.1, "key_2": VALUE_1.2, ... }, { "key_1": VALUE_2.1, "key_2": VALUE_2.2, ... }, ... ] } A python dict object represents the content of this dataset. """ if self.backend == "huggingface": dict_obj = {} dict_obj[KEY_TYPE] = self.get_type() hf_dict = self.backend_dataset.to_dict() dict_obj[KEY_INSTANCES] = [] first_key = None for key in hf_dict.keys(): first_key = key break if first_key is not None: num_instances = len(hf_dict[first_key]) dict_obj[KEY_INSTANCES] = [ { key: hf_dict[key][i] for key in hf_dict.keys() } for i in range(num_instances) ] return dict_obj else: raise NotImplementedError( f'Current .to_dict is not supported for backend "{backend}"' ) def map(self, *args, **kwargs): r""" Parameters ------------ args : Optional. Positional arguments. kwargs : Optional. Keyword arguments. Returns --------- self : Dataset object. """ # If the dataset uses Hugging Face as the backend, # call the `map()` function of the Hugging Face backend dataset if self.backend == "huggingface": # Set the mapped dataset as the backend dataset of the current dataset mapped_backend_dataset = self.backend_dataset.map(*args, **kwargs) self.backend_dataset = mapped_backend_dataset return self else: # If the backend is not Hugging Face, raise a NotImplementedError raise NotImplementedError( f'Currently .map is not supported for backend "{backend}"' ) def get_backend(self) -> Optional[str]: r""" Returns --------- self.backend """ return self.backend def get_backend_dataset(self): r""" Returns --------- self.backend_dataset """ return self.backend_dataset def get_data_args(self): r""" Returns --------- self.data_args """ return self.data_args def get_type(self): r""" Returns --------- self.type """ return self.type