Elron committed on
Commit
18ed1aa
·
verified ·
1 Parent(s): 7e34b8a

Upload loaders.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. loaders.py +32 -13
loaders.py CHANGED
@@ -36,9 +36,12 @@ from tqdm import tqdm
36
 
37
  from .logging_utils import get_logger
38
  from .operator import SourceOperator
 
39
  from .stream import MultiStream, Stream
40
 
41
  logger = get_logger()
 
 
42
  try:
43
  import ibm_boto3
44
 
@@ -88,16 +91,23 @@ class LoadHF(Loader):
88
  NotImplementedError
89
  ): # streaming is not supported for zipped files so we load without streaming
90
  with tempfile.TemporaryDirectory() as dir_to_be_deleted:
91
- dataset = hf_load_dataset(
92
- self.path,
93
- name=self.name,
94
- data_dir=self.data_dir,
95
- data_files=self.data_files,
96
- streaming=False,
97
- keep_in_memory=True,
98
- cache_dir=dir_to_be_deleted,
99
- split=self.split,
100
- )
 
 
 
 
 
 
 
101
  if self.split is None:
102
  for split in dataset.keys():
103
  dataset[split] = dataset[split].to_iterable_dataset()
@@ -268,9 +278,18 @@ class LoadFromIBMCloud(Loader):
268
  if self.data_dir is not None
269
  else data_file
270
  )
271
- self._download_from_cos(
272
- cos, self.bucket_name, object_key, local_dir + "/" + data_file
273
- )
 
 
 
 
 
 
 
 
 
274
 
275
  if isinstance(self.data_files, list):
276
  dataset = hf_load_dataset(local_dir, streaming=False)
 
36
 
37
  from .logging_utils import get_logger
38
  from .operator import SourceOperator
39
+ from .settings_utils import get_settings
40
  from .stream import MultiStream, Stream
41
 
42
  logger = get_logger()
43
+ settings = get_settings()
44
+
45
  try:
46
  import ibm_boto3
47
 
 
91
  NotImplementedError
92
  ): # streaming is not supported for zipped files so we load without streaming
93
  with tempfile.TemporaryDirectory() as dir_to_be_deleted:
94
+ try:
95
+ dataset = hf_load_dataset(
96
+ self.path,
97
+ name=self.name,
98
+ data_dir=self.data_dir,
99
+ data_files=self.data_files,
100
+ streaming=False,
101
+ keep_in_memory=True,
102
+ cache_dir=dir_to_be_deleted,
103
+ split=self.split,
104
+ trust_remote_code=settings.allow_unverified_code,
105
+ )
106
+ except ValueError as e:
107
+ if "trust_remote_code" in str(e):
108
+ raise ValueError(
109
+ f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
110
+ ) from e
111
  if self.split is None:
112
  for split in dataset.keys():
113
  dataset[split] = dataset[split].to_iterable_dataset()
 
278
  if self.data_dir is not None
279
  else data_file
280
  )
281
+ with tempfile.NamedTemporaryFile() as temp_file:
282
+ # Download to a temporary file in same file partition, and then do an atomic move
283
+ self._download_from_cos(
284
+ cos,
285
+ self.bucket_name,
286
+ object_key,
287
+ local_dir + "/" + os.path.basename(temp_file.name),
288
+ )
289
+ os.rename(
290
+ local_dir + "/" + os.path.basename(temp_file.name),
291
+ local_dir + "/" + data_file,
292
+ )
293
 
294
  if isinstance(self.data_files, list):
295
  dataset = hf_load_dataset(local_dir, streaming=False)