Upload loaders.py with huggingface_hub
Browse files- loaders.py +24 -0
loaders.py
CHANGED
@@ -34,6 +34,7 @@ from datasets import load_dataset as hf_load_dataset
|
|
34 |
from tqdm import tqdm
|
35 |
|
36 |
from .dataclass import InternalField, OptionalField
|
|
|
37 |
from .logging_utils import get_logger
|
38 |
from .operator import SourceOperator
|
39 |
from .settings_utils import get_settings
|
@@ -449,3 +450,26 @@ class LoadFromIBMCloud(Loader):
|
|
449 |
)
|
450 |
|
451 |
return MultiStream.from_iterables(dataset)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
from tqdm import tqdm
|
35 |
|
36 |
from .dataclass import InternalField, OptionalField
|
37 |
+
from .fusion import FixedFusion
|
38 |
from .logging_utils import get_logger
|
39 |
from .operator import SourceOperator
|
40 |
from .settings_utils import get_settings
|
|
|
450 |
)
|
451 |
|
452 |
return MultiStream.from_iterables(dataset)
|
453 |
+
|
454 |
+
|
455 |
+
class MultipleSourceLoader(Loader):
|
456 |
+
"""Allow loading data from multiple sources.
|
457 |
+
|
458 |
+
Examples:
|
459 |
+
1) Loading the train split from Huggingface hub and the test set from a local file:
|
460 |
+
|
461 |
+
MultipleSourceLoader(sources = [ LoadHF(path="public/data",split="train"), LoadCSV({"test": "mytest.csv"}) ])
|
462 |
+
|
463 |
+
2) Loading a test set combined from two files
|
464 |
+
|
465 |
+
MultipleSourceLoader(sources = [ LoadCSV({"test": "mytest1.csv"}), LoadCSV({"test": "mytest2.csv"}) ])
|
466 |
+
|
467 |
+
|
468 |
+
"""
|
469 |
+
|
470 |
+
sources: List[Loader]
|
471 |
+
|
472 |
+
def process(self):
|
473 |
+
return FixedFusion(
|
474 |
+
origins=self.sources, max_instances_per_origin=self.get_limit()
|
475 |
+
).process()
|