File size: 669 Bytes
26a73a2
e3ab2c6
 
 
26a73a2
 
 
e3ab2c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from typing import Mapping, Optional, Sequence, Union

from datasets import load_dataset as hf_load_dataset

from .operator import SourceOperator
from .stream import MultiStream


class Loader(SourceOperator):
    """Common base type for data loaders.

    Adds no attributes or methods of its own on top of ``SourceOperator``;
    concrete loaders subclass it and supply the loading logic.
    """


class LoadHF(Loader):
    """Loader backed by the HuggingFace ``datasets`` library.

    The attributes below are forwarded to ``datasets.load_dataset`` when
    ``process`` is called; streaming mode is always requested.
    """

    # Positional ``path`` argument of ``load_dataset``.
    path: str
    # Forwarded as the ``name`` keyword of ``load_dataset``.
    name: Optional[str] = None
    # Forwarded as the ``data_dir`` keyword of ``load_dataset``.
    data_dir: Optional[str] = None
    # Forwarded as the ``data_files`` keyword: a single file, a sequence of
    # files, or a mapping from a string key to file(s).
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None
    # Plain class attribute (deliberately left unannotated); not read here.
    # NOTE(review): presumably consumed by the operator base class - confirm.
    cached = False

    def process(self):
        """Load the configured dataset in streaming mode as a ``MultiStream``.

        Returns:
            The result of ``MultiStream.from_iterables`` applied to the
            object returned by ``load_dataset``.
        """
        source_path = self.path
        load_kwargs = {
            "name": self.name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            # Always stream rather than materialising the dataset up front.
            "streaming": True,
        }
        streamed_dataset = hf_load_dataset(source_path, **load_kwargs)
        multi_stream = MultiStream.from_iterables(streamed_dataset)
        return multi_stream