Elron committed on
Commit
a873536
·
verified ·
1 Parent(s): 3128771

Upload loaders.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. loaders.py +24 -0
loaders.py CHANGED
@@ -34,6 +34,7 @@ from datasets import load_dataset as hf_load_dataset
34
  from tqdm import tqdm
35
 
36
  from .dataclass import InternalField, OptionalField
 
37
  from .logging_utils import get_logger
38
  from .operator import SourceOperator
39
  from .settings_utils import get_settings
@@ -449,3 +450,26 @@ class LoadFromIBMCloud(Loader):
449
  )
450
 
451
  return MultiStream.from_iterables(dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  from tqdm import tqdm
35
 
36
  from .dataclass import InternalField, OptionalField
37
+ from .fusion import FixedFusion
38
  from .logging_utils import get_logger
39
  from .operator import SourceOperator
40
  from .settings_utils import get_settings
 
450
  )
451
 
452
  return MultiStream.from_iterables(dataset)
453
+
454
+
455
class MultipleSourceLoader(Loader):
    """Allow loading data from multiple sources.

    The loaders listed in ``sources`` are handed to ``FixedFusion`` as its
    origins, and the fused result is returned as the output of this loader.

    Attributes:
        sources: The loaders whose outputs are combined.

    Examples:
        1) Loading the train split from Huggingface hub and the test set from a local file:

            MultipleSourceLoader(sources=[LoadHF(path="public/data", split="train"), LoadCSV({"test": "mytest.csv"})])

        2) Loading a test set combined from two files:

            MultipleSourceLoader(sources=[LoadCSV({"test": "mytest1.csv"}), LoadCSV({"test": "mytest2.csv"})])
    """

    sources: List[Loader]

    def process(self):
        """Fuse all configured sources and return the fused result.

        The value of ``self.get_limit()`` is forwarded to ``FixedFusion`` as
        ``max_instances_per_origin``.

        NOTE(review): ``get_limit`` is not defined in this class; presumably
        it is inherited from ``Loader`` -- confirm.
        """
        return FixedFusion(
            origins=self.sources, max_instances_per_origin=self.get_limit()
        ).process()