nisten committed on
Commit
04f9ed1
·
verified ·
1 Parent(s): dbe1b18

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. meow/lib/python3.13/site-packages/__pycache__/typing_extensions.cpython-313.pyc +0 -0
  2. meow/lib/python3.13/site-packages/charset_normalizer/__init__.py +48 -0
  3. meow/lib/python3.13/site-packages/charset_normalizer/api.py +668 -0
  4. meow/lib/python3.13/site-packages/charset_normalizer/legacy.py +66 -0
  5. meow/lib/python3.13/site-packages/charset_normalizer/md.py +630 -0
  6. meow/lib/python3.13/site-packages/charset_normalizer/models.py +360 -0
  7. meow/lib/python3.13/site-packages/charset_normalizer/utils.py +408 -0
  8. meow/lib/python3.13/site-packages/charset_normalizer/version.py +8 -0
  9. meow/lib/python3.13/site-packages/filelock/__init__.py +70 -0
  10. meow/lib/python3.13/site-packages/filelock/_api.py +403 -0
  11. meow/lib/python3.13/site-packages/filelock/_error.py +30 -0
  12. meow/lib/python3.13/site-packages/filelock/_soft.py +47 -0
  13. meow/lib/python3.13/site-packages/filelock/_unix.py +68 -0
  14. meow/lib/python3.13/site-packages/filelock/_util.py +52 -0
  15. meow/lib/python3.13/site-packages/filelock/_windows.py +65 -0
  16. meow/lib/python3.13/site-packages/filelock/asyncio.py +342 -0
  17. meow/lib/python3.13/site-packages/filelock/py.typed +0 -0
  18. meow/lib/python3.13/site-packages/filelock/version.py +16 -0
  19. meow/lib/python3.13/site-packages/fsspec/__init__.py +69 -0
  20. meow/lib/python3.13/site-packages/fsspec/_version.py +16 -0
  21. meow/lib/python3.13/site-packages/fsspec/archive.py +73 -0
  22. meow/lib/python3.13/site-packages/fsspec/asyn.py +1098 -0
  23. meow/lib/python3.13/site-packages/fsspec/caching.py +966 -0
  24. meow/lib/python3.13/site-packages/fsspec/callbacks.py +324 -0
  25. meow/lib/python3.13/site-packages/fsspec/compression.py +175 -0
  26. meow/lib/python3.13/site-packages/fsspec/config.py +131 -0
  27. meow/lib/python3.13/site-packages/fsspec/conftest.py +55 -0
  28. meow/lib/python3.13/site-packages/fsspec/core.py +743 -0
  29. meow/lib/python3.13/site-packages/fsspec/dircache.py +98 -0
  30. meow/lib/python3.13/site-packages/fsspec/exceptions.py +18 -0
  31. meow/lib/python3.13/site-packages/fsspec/fuse.py +324 -0
  32. meow/lib/python3.13/site-packages/fsspec/generic.py +411 -0
  33. meow/lib/python3.13/site-packages/fsspec/gui.py +416 -0
  34. meow/lib/python3.13/site-packages/fsspec/json.py +121 -0
  35. meow/lib/python3.13/site-packages/fsspec/mapping.py +251 -0
  36. meow/lib/python3.13/site-packages/fsspec/parquet.py +541 -0
  37. meow/lib/python3.13/site-packages/fsspec/registry.py +315 -0
  38. meow/lib/python3.13/site-packages/fsspec/spec.py +2242 -0
  39. meow/lib/python3.13/site-packages/fsspec/transaction.py +90 -0
  40. meow/lib/python3.13/site-packages/fsspec/utils.py +739 -0
  41. meow/lib/python3.13/site-packages/huggingface_hub-0.27.0.dist-info/INSTALLER +1 -0
  42. meow/lib/python3.13/site-packages/huggingface_hub-0.27.0.dist-info/METADATA +308 -0
  43. meow/lib/python3.13/site-packages/huggingface_hub/__init__.py +1028 -0
  44. meow/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py +353 -0
  45. meow/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py +396 -0
  46. meow/lib/python3.13/site-packages/huggingface_hub/_local_folder.py +421 -0
  47. meow/lib/python3.13/site-packages/huggingface_hub/_login.py +520 -0
  48. meow/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py +307 -0
  49. meow/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py +621 -0
  50. meow/lib/python3.13/site-packages/huggingface_hub/community.py +355 -0
meow/lib/python3.13/site-packages/__pycache__/typing_extensions.cpython-313.pyc ADDED
Binary file (143 kB). View file
 
meow/lib/python3.13/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Others methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
meow/lib/python3.13/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import PathLike
5
+ from typing import BinaryIO
6
+
7
+ from .cd import (
8
+ coherence_ratio,
9
+ encoding_languages,
10
+ mb_encoding_languages,
11
+ merge_coherence_ratios,
12
+ )
13
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
14
+ from .md import mess_ratio
15
+ from .models import CharsetMatch, CharsetMatches
16
+ from .utils import (
17
+ any_specified_encoding,
18
+ cut_sequence_chunks,
19
+ iana_name,
20
+ identify_sig_or_bom,
21
+ is_cp_similar,
22
+ is_multi_byte_encoding,
23
+ should_strip_sig_or_bom,
24
+ )
25
+
26
# Package-wide logger; silent by default (NullHandler is attached in __init__).
logger = logging.getLogger("charset_normalizer")
# Debug handler attached/detached by from_bytes() only while explain=True.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
31
+
32
+
33
def from_bytes(
    sequences: bytes | bytearray,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {}".format(
                type(sequences)
            )
        )

    # Verbose tracing: temporarily attach the module-level StreamHandler.
    # The matching removeHandler/setLevel runs on every return path below.
    if explain:
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        if explain:  # Defensive: ensure exit path clean handler
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level or logging.WARNING)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    # Normalize user-supplied include/exclude hints to IANA codec names.
    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Shrink the sampling plan when the payload is smaller than steps*chunk_size.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    # Candidates tried before the full IANA_SUPPORTED sweep, in priority order:
    # declared encoding (if preemptive), BOM/SIG encoding, ascii, utf_8.
    prioritized_encodings: list[str] = []

    specified_encoding: str | None = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: set[str] = set()
    tested_but_hard_failure: list[str] = []
    tested_but_soft_failure: list[str] = []

    fallback_ascii: CharsetMatch | None = None
    fallback_u8: CharsetMatch | None = None
    fallback_specified: CharsetMatch | None = None

    results: CharsetMatches = CharsetMatches()

    early_stop_results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: str | None = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        # UTF-16/UTF-32 without a BOM are ambiguous; their LE/BE variants are
        # covered separately by IANA_SUPPORTED.
        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # First decodability probe. For very large single-byte payloads only
        # the first 500kB is decoded here (lazy mode); the tail is re-checked
        # further down.
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    (
                        sequences[: int(50e4)]
                        if strip_sig_or_bom is False
                        else sequences[len(sig_payload) : int(50e4)]
                    ),
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    (
                        sequences
                        if strip_sig_or_bom is False
                        else sequences[len(sig_payload) :]
                    ),
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        # Skip code pages nearly identical to one that already soft-failed.
        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Chunk start offsets used for sampling across the payload.
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: list[str] = []
        md_ratios = []

        # Mess-detection (MD) pass: score each sampled chunk, abandon the
        # code page once a quarter of the chunks exceed the threshold.
        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences,
                    encoding_iana,
                    threshold,
                    False,
                    [],
                    decoded_payload,
                    preemptive_declaration=specified_encoding,
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        # Coherence-detection (CD) pass: guess likely languages per chunk.
        if not is_multi_byte_decoder:
            target_languages: list[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        current_match = CharsetMatch(
            sequences,
            encoding_iana,
            mean_mess_ratio,
            bom_or_sig_available,
            cd_ratios_merged,
            (
                decoded_payload
                if (
                    is_too_large_sequence is False
                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
                )
                else None
            ),
            preemptive_declaration=specified_encoding,
        )

        results.append(current_match)

        # Early-exit shortcuts for the high-priority candidates.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            # If md says nothing to worry about, then... stop immediately!
            if mean_mess_ratio == 0.0:
                logger.debug(
                    "Encoding detection: %s is most likely the one.",
                    current_match.encoding,
                )
                if explain:  # Defensive: ensure exit path clean handler
                    logger.removeHandler(explain_handler)
                    logger.setLevel(previous_logger_level)
                return CharsetMatches([current_match])

            early_stop_results.append(current_match)

            if (
                len(early_stop_results)
                and (specified_encoding is None or specified_encoding in tested)
                and "ascii" in tested
                and "utf_8" in tested
            ):
                probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
                logger.debug(
                    "Encoding detection: %s is most likely the one.",
                    probable_result.encoding,
                )
                if explain:  # Defensive: ensure exit path clean handler
                    logger.removeHandler(explain_handler)
                    logger.setLevel(previous_logger_level)

                return CharsetMatches([probable_result])

        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            if explain:  # Defensive: ensure exit path clean handler
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

    # No candidate survived: fall back on the prepared near-miss matches.
    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif (
            (fallback_u8 and fallback_ascii is None)
            or (
                fallback_u8
                and fallback_ascii
                and fallback_u8.fingerprint != fallback_ascii.fingerprint
            )
            or (fallback_u8 is not None)
        ):
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    if explain:
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)

    return results
542
+
543
+
544
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run charset detection on the full content of an already-opened binary
    file pointer. Same contract as from_bytes(); the pointer is consumed
    but never closed by this function.
    """
    return from_bytes(
        fp.read(),
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
572
+
573
+
574
def from_path(
    path: str | bytes | PathLike,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file at *path* in binary mode and run charset detection on its
    whole content. Same contract as from_bytes(). Can raise IOError.
    """
    with open(path, "rb") as fp:
        return from_fp(
            fp,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
603
+
604
+
605
def is_binary(
    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    """
    # The three dispatch targets accept the exact same tuning knobs;
    # assemble them once instead of repeating the list per branch.
    detection_kwargs = dict(
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    # Dispatch on input flavor: filesystem path, raw payload, or open file.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        guesses = from_path(fp_or_path_or_payload, **detection_kwargs)
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        guesses = from_bytes(fp_or_path_or_payload, **detection_kwargs)
    else:
        guesses = from_fp(fp_or_path_or_payload, **detection_kwargs)

    # No plausible text charset found at all => treat as binary.
    return not guesses
meow/lib/python3.13/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    # Static-typing-only shape of the dict returned by detect();
    # mirrors the result payload chardet produces.
    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None
19
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
        to their more modern equivalents?
    """
    # chardet's detect() never accepted extra kwargs; warn instead of failing.
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: {}".format(
                type(byte_str)
            )
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    if best_match is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        language = best_match.language if best_match.language != "Unknown" else ""
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get
        # stripped in the detection/normalization process but chardet does
        # return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    # Translate to chardet's historical spelling unless the caller opted
    # into modern names.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
meow/lib/python3.13/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from logging import getLogger
5
+
6
+ from .constant import (
7
+ COMMON_SAFE_ASCII_CHARACTERS,
8
+ TRACE,
9
+ UNICODE_SECONDARY_RANGE_KEYWORD,
10
+ )
11
+ from .utils import (
12
+ is_accentuated,
13
+ is_arabic,
14
+ is_arabic_isolated_form,
15
+ is_case_variable,
16
+ is_cjk,
17
+ is_emoticon,
18
+ is_hangul,
19
+ is_hiragana,
20
+ is_katakana,
21
+ is_latin,
22
+ is_punctuation,
23
+ is_separator,
24
+ is_symbol,
25
+ is_thai,
26
+ is_unprintable,
27
+ remove_accent,
28
+ unicode_range,
29
+ )
30
+
31
+
32
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
64
+
65
+
66
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Flag content where symbols/punctuation crowd out regular characters.

    Immediately repeated characters and common safe ASCII characters are not
    counted; symbols weigh twice as much as punctuation in the final ratio.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; used to skip immediate repeats.
        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols are considered twice as suspicious as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Bug fix: the tracking state below was previously left untouched by
        # reset(), so a reused instance did not behave like a fresh one
        # (reset() is documented as restoring the initial state).
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        # Below 30% the signal is considered noise.
        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
111
+
112
+
113
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Detect text where accentuated letters are overly frequent.

    A very high share of accentuated letters in alphabetic content usually
    betrays a wrong single-byte decoding.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        # Only letters carry an accent signal.
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        # Too little data to pronounce ourselves.
        if self._character_count < 8:
            return 0.0

        accent_share: float = self._accentuated_count / self._character_count
        return accent_share if accent_share >= 0.35 else 0.0
138
+
139
+
140
class UnprintablePlugin(MessDetectorPlugin):
    """Detect unprintable (control/garbage) characters in the stream.

    Each unprintable occurrence weighs heavily (x8) in the final ratio since
    legitimate text should contain virtually none of them.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected; the printability test lives in feed().
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0
        # Bug fix: the total character counter must also be cleared, otherwise
        # a reused (reset) instance under-reports the ratio of a new stream.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
162
+
163
+
164
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Spot consecutive accentuated Latin letters, a decoding-mess marker."""

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if it is the same letter duplicated with a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
199
+
200
+
201
class SuspiciousRange(MessDetectorPlugin):
    """Count suspicious jumps between Unicode ranges of adjacent characters."""

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the adjacency
        # chain: they legitimately separate scripts.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        range_of_previous: str | None = unicode_range(self._last_printable_seen)
        range_of_current: str | None = unicode_range(character)

        if is_suspiciously_successive_range(range_of_previous, range_of_current):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Not enough signal on very short content.
        if self._character_count <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / self._character_count
248
+
249
+
250
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Detect implausible "words": heavy accentuation, lone CJK glyphs inside
    otherwise-foreign words, very long non-Latin runs, or symbols glued into
    words.

    Characters are accumulated into a word buffer until a separator is seen,
    then the buffered word is judged as a whole.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        # True while the current word contains non-Latin/non-Asian letters,
        # making a very long buffer suspicious.
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Current word accumulator and its per-word statistics.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
                if buffer_length >= 24 and self._foreign_long_watch:
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0
        # Bug fix: the per-word buffers below were previously left untouched,
        # letting stale counts leak into the first word analyzed after reset().
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0

    @property
    def ratio(self) -> float:
        # Require enough words (or at least one long foreign run) to judge.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
366
+
367
+
368
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # '丅' / '丄' are the tell-tale mangled stop marks; count them apart
        # from regular CJK characters.
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # Abstract
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum amount of CJK content before judging.
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count
397
+
398
+
399
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Detect abnormal uPpEr/lOwEr case alternation inside words.

    Alternation only counts toward the final tally for short chunks that are
    not ASCII-only — a typical signature of a wrong single-byte decoding.
    """

    def __init__(self) -> None:
        # Armed after one case flip; a second consecutive flip confirms it.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_cased_letter = character.isalpha() and is_case_variable(character)
        is_chunk_boundary = is_cased_letter is False

        if is_chunk_boundary and self._character_count_since_last_sep > 0:
            # Commit the finished chunk: only short, non-digit-terminated,
            # non-ASCII-only chunks contribute to the final tally.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            case_flipped = (
                character.isupper() and self._last_alpha_seen.islower()
            ) or (character.islower() and self._last_alpha_seen.isupper())
            if case_flipped:
                if self._buf is True:
                    # Second flip in a row: record it (weight 2) and rearm.
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
473
+
474
+
475
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Detect overuse of Arabic isolated-form letters.

    Properly encoded Arabic text mostly uses contextual forms; a large share
    of isolated forms hints at a mis-decoded byte stream.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Too few Arabic letters to pronounce ourselves.
        if self._character_count < 8:
            return 0.0

        return self._isolated_form_count / self._character_count
501
+
502
+
503
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # Unknown range on either side: always suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Ranges sharing a significant keyword (ignoring secondary words such as
    # "Extended") are considered compatible.
    keywords_range_a = unicode_range_a.split(" ")
    keywords_range_b = unicode_range_b.split(" ")

    for keyword in keywords_range_a:
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese Exception: kana mixed with CJK (or with each other) is normal.
    range_a_jp_chars = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars = unicode_range_b in ("Hiragana", "Katakana")

    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True
575
+
576
+
577
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """
    # Instantiate every registered detector (all subclasses of the base).
    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Aggregate the detectors periodically; the longer the content, the
    # sparser the checkpoints.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    # A trailing newline guarantees word-based detectors flush their buffer.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % checkpoint_every == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            # Early exit: already messy enough, no need to scan further.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_every} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
meow/lib/python3.13/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from encodings.aliases import aliases
4
+ from hashlib import sha256
5
+ from json import dumps
6
+ from re import sub
7
+ from typing import Any, Iterator, List, Tuple
8
+
9
+ from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
10
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
11
+
12
+
13
class CharsetMatch:
    """One candidate decoding of a byte payload, carrying its quality metrics.

    Instances are orderable: lower chaos wins, with coherence and multi-byte
    usage used as tie-breakers (see __lt__).
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        # Equivalent decodings attached through add_submatch().
        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last target encoding and its encoded bytes.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text, computed lazily by __str__ when not supplied.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        if isinstance(other, CharsetMatch):
            return (
                self.encoding == other.encoding
                and self.fingerprint == other.fingerprint
            )
        # Allow comparing directly against an encoding name (alias aware).
        if isinstance(other, str):
            return iana_name(other) == self.encoding
        return False

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        delta_chaos: float = abs(self.chaos - other.chaos)
        delta_coherence: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if delta_chaos < 0.01 and delta_coherence > 0.02:
            return self.coherence > other.coherence
        elif delta_chaos < 0.01 and delta_coherence <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        # Share of raw bytes consumed by multi-byte sequences.
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy decoding: only pay the cost when the text is first requested.
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """Attach an equivalent decoding; its decoded text is dropped to save RAM."""
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                f"Unable to add instance <{other.__class__}> as a submatch of a CharsetMatch"
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for alias_name, original_name in aliases.items():
            if self.encoding == alias_name:
                also_known_as.append(original_name)
            elif self.encoding == original_name:
                also_known_as.append(alias_name)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [language_name for language_name, _ in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return bool(self._leaves)

    @property
    def alphabets(self) -> list[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # Collect the Unicode range of every decoded character, dedupe, sort.
        seen_ranges = {unicode_range(char) for char in str(self)}
        self._unicode_ranges = sorted(r for r in seen_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the in-band charset declaration (e.g. XML/HTML
                # header) so it matches the encoding we are producing.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
243
+
244
+
245
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        # Internal list is kept permanently sorted (best match first).
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            wanted = iana_name(item, False)
            for candidate in self._results:
                if wanted in candidate.could_be_from_charset:
                    return candidate
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                f"Cannot append instance '{str(item.__class__)}' to CharsetMatches"
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strict equivalent to matches[0].
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
310
+
311
+
312
# Type aliases for coherence (language) probing results:
# a single (language, confidence) pair, and the ordered list of such pairs.
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
314
+
315
+
316
class CliDetectionResult:
    """Serializable record produced by the CLI for one analyzed file."""

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # Input file location (and its unicode-normalized copy, if any).
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        # Detected encoding plus its known aliases and viable alternatives.
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        # Quality metrics of the chosen decoding.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        # Explicit mapping keeps the JSON key order stable and intentional.
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        """Render the result as a pretty-printed, ASCII-safe JSON document."""
        return dumps(self.__dict__, ensure_ascii=True, indent=4)
meow/lib/python3.13/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import unicodedata
6
+ from codecs import IncrementalDecoder
7
+ from encodings.aliases import aliases
8
+ from functools import lru_cache
9
+ from re import findall
10
+ from typing import Generator
11
+
12
+ from _multibytecodec import ( # type: ignore[import-not-found,import]
13
+ MultibyteIncrementalDecoder,
14
+ )
15
+
16
+ from .constant import (
17
+ ENCODING_MARKS,
18
+ IANA_SUPPORTED_SIMILAR,
19
+ RE_POSSIBLE_ENCODING_INDICATION,
20
+ UNICODE_RANGES_COMBINED,
21
+ UNICODE_SECONDARY_RANGE_KEYWORD,
22
+ UTF8_MAXIMAL_ALLOCATION,
23
+ )
24
+
25
+
26
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
27
+ def is_accentuated(character: str) -> bool:
28
+ try:
29
+ description: str = unicodedata.name(character)
30
+ except ValueError: # Defensive: unicode database outdated?
31
+ return False
32
+ return (
33
+ "WITH GRAVE" in description
34
+ or "WITH ACUTE" in description
35
+ or "WITH CEDILLA" in description
36
+ or "WITH DIAERESIS" in description
37
+ or "WITH CIRCUMFLEX" in description
38
+ or "WITH TILDE" in description
39
+ or "WITH MACRON" in description
40
+ or "WITH RING ABOVE" in description
41
+ )
42
+
43
+
44
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
45
+ def remove_accent(character: str) -> str:
46
+ decomposed: str = unicodedata.decomposition(character)
47
+ if not decomposed:
48
+ return character
49
+
50
+ codes: list[str] = decomposed.split(" ")
51
+
52
+ return chr(int(codes[0], 16))
53
+
54
+
55
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
56
+ def unicode_range(character: str) -> str | None:
57
+ """
58
+ Retrieve the Unicode range official name from a single character.
59
+ """
60
+ character_ord: int = ord(character)
61
+
62
+ for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
63
+ if character_ord in ord_range:
64
+ return range_name
65
+
66
+ return None
67
+
68
+
69
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
70
+ def is_latin(character: str) -> bool:
71
+ try:
72
+ description: str = unicodedata.name(character)
73
+ except ValueError: # Defensive: unicode database outdated?
74
+ return False
75
+ return "LATIN" in description
76
+
77
+
78
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
79
+ def is_punctuation(character: str) -> bool:
80
+ character_category: str = unicodedata.category(character)
81
+
82
+ if "P" in character_category:
83
+ return True
84
+
85
+ character_range: str | None = unicode_range(character)
86
+
87
+ if character_range is None:
88
+ return False
89
+
90
+ return "Punctuation" in character_range
91
+
92
+
93
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
94
+ def is_symbol(character: str) -> bool:
95
+ character_category: str = unicodedata.category(character)
96
+
97
+ if "S" in character_category or "N" in character_category:
98
+ return True
99
+
100
+ character_range: str | None = unicode_range(character)
101
+
102
+ if character_range is None:
103
+ return False
104
+
105
+ return "Forms" in character_range and character_category != "Lo"
106
+
107
+
108
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
109
+ def is_emoticon(character: str) -> bool:
110
+ character_range: str | None = unicode_range(character)
111
+
112
+ if character_range is None:
113
+ return False
114
+
115
+ return "Emoticons" in character_range or "Pictographs" in character_range
116
+
117
+
118
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
119
+ def is_separator(character: str) -> bool:
120
+ if character.isspace() or character in {"|", "+", "<", ">"}:
121
+ return True
122
+
123
+ character_category: str = unicodedata.category(character)
124
+
125
+ return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
126
+
127
+
128
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
129
+ def is_case_variable(character: str) -> bool:
130
+ return character.islower() != character.isupper()
131
+
132
+
133
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
134
+ def is_cjk(character: str) -> bool:
135
+ try:
136
+ character_name = unicodedata.name(character)
137
+ except ValueError: # Defensive: unicode database outdated?
138
+ return False
139
+
140
+ return "CJK" in character_name
141
+
142
+
143
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
144
+ def is_hiragana(character: str) -> bool:
145
+ try:
146
+ character_name = unicodedata.name(character)
147
+ except ValueError: # Defensive: unicode database outdated?
148
+ return False
149
+
150
+ return "HIRAGANA" in character_name
151
+
152
+
153
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
154
+ def is_katakana(character: str) -> bool:
155
+ try:
156
+ character_name = unicodedata.name(character)
157
+ except ValueError: # Defensive: unicode database outdated?
158
+ return False
159
+
160
+ return "KATAKANA" in character_name
161
+
162
+
163
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
164
+ def is_hangul(character: str) -> bool:
165
+ try:
166
+ character_name = unicodedata.name(character)
167
+ except ValueError: # Defensive: unicode database outdated?
168
+ return False
169
+
170
+ return "HANGUL" in character_name
171
+
172
+
173
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
174
+ def is_thai(character: str) -> bool:
175
+ try:
176
+ character_name = unicodedata.name(character)
177
+ except ValueError: # Defensive: unicode database outdated?
178
+ return False
179
+
180
+ return "THAI" in character_name
181
+
182
+
183
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
184
+ def is_arabic(character: str) -> bool:
185
+ try:
186
+ character_name = unicodedata.name(character)
187
+ except ValueError: # Defensive: unicode database outdated?
188
+ return False
189
+
190
+ return "ARABIC" in character_name
191
+
192
+
193
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
194
+ def is_arabic_isolated_form(character: str) -> bool:
195
+ try:
196
+ character_name = unicodedata.name(character)
197
+ except ValueError: # Defensive: unicode database outdated?
198
+ return False
199
+
200
+ return "ARABIC" in character_name and "ISOLATED FORM" in character_name
201
+
202
+
203
+ @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
204
+ def is_unicode_range_secondary(range_name: str) -> bool:
205
+ return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
206
+
207
+
208
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
209
+ def is_unprintable(character: str) -> bool:
210
+ return (
211
+ character.isspace() is False # includes \n \t \r \v
212
+ and character.isprintable() is False
213
+ and character != "\x1a" # Why? Its the ASCII substitute character.
214
+ and character != "\ufeff" # bug discovered in Python,
215
+ # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
216
+ )
217
+
218
+
219
+ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
220
+ """
221
+ Extract using ASCII-only decoder any specified encoding in the first n-bytes.
222
+ """
223
+ if not isinstance(sequence, bytes):
224
+ raise TypeError
225
+
226
+ seq_len: int = len(sequence)
227
+
228
+ results: list[str] = findall(
229
+ RE_POSSIBLE_ENCODING_INDICATION,
230
+ sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
231
+ )
232
+
233
+ if len(results) == 0:
234
+ return None
235
+
236
+ for specified_encoding in results:
237
+ specified_encoding = specified_encoding.lower().replace("-", "_")
238
+
239
+ encoding_alias: str
240
+ encoding_iana: str
241
+
242
+ for encoding_alias, encoding_iana in aliases.items():
243
+ if encoding_alias == specified_encoding:
244
+ return encoding_iana
245
+ if encoding_iana == specified_encoding:
246
+ return encoding_iana
247
+
248
+ return None
249
+
250
+
251
+ @lru_cache(maxsize=128)
252
+ def is_multi_byte_encoding(name: str) -> bool:
253
+ """
254
+ Verify is a specific encoding is a multi byte one based on it IANA name
255
+ """
256
+ return name in {
257
+ "utf_8",
258
+ "utf_8_sig",
259
+ "utf_16",
260
+ "utf_16_be",
261
+ "utf_16_le",
262
+ "utf_32",
263
+ "utf_32_le",
264
+ "utf_32_be",
265
+ "utf_7",
266
+ } or issubclass(
267
+ importlib.import_module(f"encodings.{name}").IncrementalDecoder,
268
+ MultibyteIncrementalDecoder,
269
+ )
270
+
271
+
272
+ def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
273
+ """
274
+ Identify and extract SIG/BOM in given sequence.
275
+ """
276
+
277
+ for iana_encoding in ENCODING_MARKS:
278
+ marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
279
+
280
+ if isinstance(marks, bytes):
281
+ marks = [marks]
282
+
283
+ for mark in marks:
284
+ if sequence.startswith(mark):
285
+ return iana_encoding, mark
286
+
287
+ return None, b""
288
+
289
+
290
+ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
291
+ return iana_encoding not in {"utf_16", "utf_32"}
292
+
293
+
294
+ def iana_name(cp_name: str, strict: bool = True) -> str:
295
+ """Returns the Python normalized encoding name (Not the IANA official name)."""
296
+ cp_name = cp_name.lower().replace("-", "_")
297
+
298
+ encoding_alias: str
299
+ encoding_iana: str
300
+
301
+ for encoding_alias, encoding_iana in aliases.items():
302
+ if cp_name in [encoding_alias, encoding_iana]:
303
+ return encoding_iana
304
+
305
+ if strict:
306
+ raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
307
+
308
+ return cp_name
309
+
310
+
311
+ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
312
+ if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
313
+ return 0.0
314
+
315
+ decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
316
+ decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
317
+
318
+ id_a: IncrementalDecoder = decoder_a(errors="ignore")
319
+ id_b: IncrementalDecoder = decoder_b(errors="ignore")
320
+
321
+ character_match_count: int = 0
322
+
323
+ for i in range(255):
324
+ to_be_decoded: bytes = bytes([i])
325
+ if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
326
+ character_match_count += 1
327
+
328
+ return character_match_count / 254
329
+
330
+
331
+ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
332
+ """
333
+ Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
334
+ the function cp_similarity.
335
+ """
336
+ return (
337
+ iana_name_a in IANA_SUPPORTED_SIMILAR
338
+ and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
339
+ )
340
+
341
+
342
+ def set_logging_handler(
343
+ name: str = "charset_normalizer",
344
+ level: int = logging.INFO,
345
+ format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
346
+ ) -> None:
347
+ logger = logging.getLogger(name)
348
+ logger.setLevel(level)
349
+
350
+ handler = logging.StreamHandler()
351
+ handler.setFormatter(logging.Formatter(format_string))
352
+ logger.addHandler(handler)
353
+
354
+
355
+ def cut_sequence_chunks(
356
+ sequences: bytes,
357
+ encoding_iana: str,
358
+ offsets: range,
359
+ chunk_size: int,
360
+ bom_or_sig_available: bool,
361
+ strip_sig_or_bom: bool,
362
+ sig_payload: bytes,
363
+ is_multi_byte_decoder: bool,
364
+ decoded_payload: str | None = None,
365
+ ) -> Generator[str, None, None]:
366
+ if decoded_payload and is_multi_byte_decoder is False:
367
+ for i in offsets:
368
+ chunk = decoded_payload[i : i + chunk_size]
369
+ if not chunk:
370
+ break
371
+ yield chunk
372
+ else:
373
+ for i in offsets:
374
+ chunk_end = i + chunk_size
375
+ if chunk_end > len(sequences) + 8:
376
+ continue
377
+
378
+ cut_sequence = sequences[i : i + chunk_size]
379
+
380
+ if bom_or_sig_available and strip_sig_or_bom is False:
381
+ cut_sequence = sig_payload + cut_sequence
382
+
383
+ chunk = cut_sequence.decode(
384
+ encoding_iana,
385
+ errors="ignore" if is_multi_byte_decoder else "strict",
386
+ )
387
+
388
+ # multi-byte bad cutting detector and adjustment
389
+ # not the cleanest way to perform that fix but clever enough for now.
390
+ if is_multi_byte_decoder and i > 0:
391
+ chunk_partial_size_chk: int = min(chunk_size, 16)
392
+
393
+ if (
394
+ decoded_payload
395
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
396
+ ):
397
+ for j in range(i, i - 4, -1):
398
+ cut_sequence = sequences[j:chunk_end]
399
+
400
+ if bom_or_sig_available and strip_sig_or_bom is False:
401
+ cut_sequence = sig_payload + cut_sequence
402
+
403
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
404
+
405
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
406
+ break
407
+
408
+ yield chunk
meow/lib/python3.13/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expose version
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ __version__ = "3.4.1"
8
+ VERSION = __version__.split(".")
meow/lib/python3.13/site-packages/filelock/__init__.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A platform independent file lock that supports the with-statement.
3
+
4
+ .. autodata:: filelock.__version__
5
+ :no-value:
6
+
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import sys
12
+ import warnings
13
+ from typing import TYPE_CHECKING
14
+
15
+ from ._api import AcquireReturnProxy, BaseFileLock
16
+ from ._error import Timeout
17
+ from ._soft import SoftFileLock
18
+ from ._unix import UnixFileLock, has_fcntl
19
+ from ._windows import WindowsFileLock
20
+ from .asyncio import (
21
+ AsyncAcquireReturnProxy,
22
+ AsyncSoftFileLock,
23
+ AsyncUnixFileLock,
24
+ AsyncWindowsFileLock,
25
+ BaseAsyncFileLock,
26
+ )
27
+ from .version import version
28
+
29
+ #: version of the project as a string
30
+ __version__: str = version
31
+
32
+
33
+ if sys.platform == "win32": # pragma: win32 cover
34
+ _FileLock: type[BaseFileLock] = WindowsFileLock
35
+ _AsyncFileLock: type[BaseAsyncFileLock] = AsyncWindowsFileLock
36
+ else: # pragma: win32 no cover # noqa: PLR5501
37
+ if has_fcntl:
38
+ _FileLock: type[BaseFileLock] = UnixFileLock
39
+ _AsyncFileLock: type[BaseAsyncFileLock] = AsyncUnixFileLock
40
+ else:
41
+ _FileLock = SoftFileLock
42
+ _AsyncFileLock = AsyncSoftFileLock
43
+ if warnings is not None:
44
+ warnings.warn("only soft file lock is available", stacklevel=2)
45
+
46
+ if TYPE_CHECKING:
47
+ FileLock = SoftFileLock
48
+ AsyncFileLock = AsyncSoftFileLock
49
+ else:
50
+ #: Alias for the lock, which should be used for the current platform.
51
+ FileLock = _FileLock
52
+ AsyncFileLock = _AsyncFileLock
53
+
54
+
55
+ __all__ = [
56
+ "AcquireReturnProxy",
57
+ "AsyncAcquireReturnProxy",
58
+ "AsyncFileLock",
59
+ "AsyncSoftFileLock",
60
+ "AsyncUnixFileLock",
61
+ "AsyncWindowsFileLock",
62
+ "BaseAsyncFileLock",
63
+ "BaseFileLock",
64
+ "FileLock",
65
+ "SoftFileLock",
66
+ "Timeout",
67
+ "UnixFileLock",
68
+ "WindowsFileLock",
69
+ "__version__",
70
+ ]
meow/lib/python3.13/site-packages/filelock/_api.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import inspect
5
+ import logging
6
+ import os
7
+ import time
8
+ import warnings
9
+ from abc import ABCMeta, abstractmethod
10
+ from dataclasses import dataclass
11
+ from threading import local
12
+ from typing import TYPE_CHECKING, Any, cast
13
+ from weakref import WeakValueDictionary
14
+
15
+ from ._error import Timeout
16
+
17
+ if TYPE_CHECKING:
18
+ import sys
19
+ from types import TracebackType
20
+
21
+ if sys.version_info >= (3, 11): # pragma: no cover (py311+)
22
+ from typing import Self
23
+ else: # pragma: no cover (<py311)
24
+ from typing_extensions import Self
25
+
26
+
27
+ _LOGGER = logging.getLogger("filelock")
28
+
29
+
30
+ # This is a helper class which is returned by :meth:`BaseFileLock.acquire` and wraps the lock to make sure __enter__
31
+ # is not called twice when entering the with statement. If we would simply return *self*, the lock would be acquired
32
+ # again in the *__enter__* method of the BaseFileLock, but not released again automatically. issue #37 (memory leak)
33
+ class AcquireReturnProxy:
34
+ """A context-aware object that will release the lock file when exiting."""
35
+
36
+ def __init__(self, lock: BaseFileLock) -> None:
37
+ self.lock = lock
38
+
39
+ def __enter__(self) -> BaseFileLock:
40
+ return self.lock
41
+
42
+ def __exit__(
43
+ self,
44
+ exc_type: type[BaseException] | None,
45
+ exc_value: BaseException | None,
46
+ traceback: TracebackType | None,
47
+ ) -> None:
48
+ self.lock.release()
49
+
50
+
51
+ @dataclass
52
+ class FileLockContext:
53
+ """A dataclass which holds the context for a ``BaseFileLock`` object."""
54
+
55
+ # The context is held in a separate class to allow optional use of thread local storage via the
56
+ # ThreadLocalFileContext class.
57
+
58
+ #: The path to the lock file.
59
+ lock_file: str
60
+
61
+ #: The default timeout value.
62
+ timeout: float
63
+
64
+ #: The mode for the lock files
65
+ mode: int
66
+
67
+ #: Whether the lock should be blocking or not
68
+ blocking: bool
69
+
70
+ #: The file descriptor for the *_lock_file* as it is returned by the os.open() function, not None when lock held
71
+ lock_file_fd: int | None = None
72
+
73
+ #: The lock counter is used for implementing the nested locking mechanism.
74
+ lock_counter: int = 0 # When the lock is acquired is increased and the lock is only released, when this value is 0
75
+
76
+
77
+ class ThreadLocalFileContext(FileLockContext, local):
78
+ """A thread local version of the ``FileLockContext`` class."""
79
+
80
+
81
+ class FileLockMeta(ABCMeta):
82
+ def __call__( # noqa: PLR0913
83
+ cls,
84
+ lock_file: str | os.PathLike[str],
85
+ timeout: float = -1,
86
+ mode: int = 0o644,
87
+ thread_local: bool = True, # noqa: FBT001, FBT002
88
+ *,
89
+ blocking: bool = True,
90
+ is_singleton: bool = False,
91
+ **kwargs: Any, # capture remaining kwargs for subclasses # noqa: ANN401
92
+ ) -> BaseFileLock:
93
+ if is_singleton:
94
+ instance = cls._instances.get(str(lock_file)) # type: ignore[attr-defined]
95
+ if instance:
96
+ params_to_check = {
97
+ "thread_local": (thread_local, instance.is_thread_local()),
98
+ "timeout": (timeout, instance.timeout),
99
+ "mode": (mode, instance.mode),
100
+ "blocking": (blocking, instance.blocking),
101
+ }
102
+
103
+ non_matching_params = {
104
+ name: (passed_param, set_param)
105
+ for name, (passed_param, set_param) in params_to_check.items()
106
+ if passed_param != set_param
107
+ }
108
+ if not non_matching_params:
109
+ return cast(BaseFileLock, instance)
110
+
111
+ # parameters do not match; raise error
112
+ msg = "Singleton lock instances cannot be initialized with differing arguments"
113
+ msg += "\nNon-matching arguments: "
114
+ for param_name, (passed_param, set_param) in non_matching_params.items():
115
+ msg += f"\n\t{param_name} (existing lock has {set_param} but {passed_param} was passed)"
116
+ raise ValueError(msg)
117
+
118
+ # Workaround to make `__init__`'s params optional in subclasses
119
+ # E.g. virtualenv changes the signature of the `__init__` method in the `BaseFileLock` class descendant
120
+ # (https://github.com/tox-dev/filelock/pull/340)
121
+
122
+ all_params = {
123
+ "timeout": timeout,
124
+ "mode": mode,
125
+ "thread_local": thread_local,
126
+ "blocking": blocking,
127
+ "is_singleton": is_singleton,
128
+ **kwargs,
129
+ }
130
+
131
+ present_params = inspect.signature(cls.__init__).parameters # type: ignore[misc]
132
+ init_params = {key: value for key, value in all_params.items() if key in present_params}
133
+
134
+ instance = super().__call__(lock_file, **init_params)
135
+
136
+ if is_singleton:
137
+ cls._instances[str(lock_file)] = instance # type: ignore[attr-defined]
138
+
139
+ return cast(BaseFileLock, instance)
140
+
141
+
142
+ class BaseFileLock(contextlib.ContextDecorator, metaclass=FileLockMeta):
143
+ """Abstract base class for a file lock object."""
144
+
145
+ _instances: WeakValueDictionary[str, BaseFileLock]
146
+
147
+ def __init_subclass__(cls, **kwargs: dict[str, Any]) -> None:
148
+ """Setup unique state for lock subclasses."""
149
+ super().__init_subclass__(**kwargs)
150
+ cls._instances = WeakValueDictionary()
151
+
152
+ def __init__( # noqa: PLR0913
153
+ self,
154
+ lock_file: str | os.PathLike[str],
155
+ timeout: float = -1,
156
+ mode: int = 0o644,
157
+ thread_local: bool = True, # noqa: FBT001, FBT002
158
+ *,
159
+ blocking: bool = True,
160
+ is_singleton: bool = False,
161
+ ) -> None:
162
+ """
163
+ Create a new lock object.
164
+
165
+ :param lock_file: path to the file
166
+ :param timeout: default timeout when acquiring the lock, in seconds. It will be used as fallback value in \
167
+ the acquire method, if no timeout value (``None``) is given. If you want to disable the timeout, set it \
168
+ to a negative value. A timeout of 0 means that there is exactly one attempt to acquire the file lock.
169
+ :param mode: file permissions for the lockfile
170
+ :param thread_local: Whether this object's internal context should be thread local or not. If this is set to \
171
+ ``False`` then the lock will be reentrant across threads.
172
+ :param blocking: whether the lock should be blocking or not
173
+ :param is_singleton: If this is set to ``True`` then only one instance of this class will be created \
174
+ per lock file. This is useful if you want to use the lock object for reentrant locking without needing \
175
+ to pass the same object around.
176
+
177
+ """
178
+ self._is_thread_local = thread_local
179
+ self._is_singleton = is_singleton
180
+
181
+ # Create the context. Note that external code should not work with the context directly and should instead use
182
+ # properties of this class.
183
+ kwargs: dict[str, Any] = {
184
+ "lock_file": os.fspath(lock_file),
185
+ "timeout": timeout,
186
+ "mode": mode,
187
+ "blocking": blocking,
188
+ }
189
+ self._context: FileLockContext = (ThreadLocalFileContext if thread_local else FileLockContext)(**kwargs)
190
+
191
+ def is_thread_local(self) -> bool:
192
+ """:return: a flag indicating if this lock is thread local or not"""
193
+ return self._is_thread_local
194
+
195
+ @property
196
+ def is_singleton(self) -> bool:
197
+ """:return: a flag indicating if this lock is singleton or not"""
198
+ return self._is_singleton
199
+
200
+ @property
201
+ def lock_file(self) -> str:
202
+ """:return: path to the lock file"""
203
+ return self._context.lock_file
204
+
205
+ @property
206
+ def timeout(self) -> float:
207
+ """
208
+ :return: the default timeout value, in seconds
209
+
210
+ .. versionadded:: 2.0.0
211
+ """
212
+ return self._context.timeout
213
+
214
+ @timeout.setter
215
+ def timeout(self, value: float | str) -> None:
216
+ """
217
+ Change the default timeout value.
218
+
219
+ :param value: the new value, in seconds
220
+
221
+ """
222
+ self._context.timeout = float(value)
223
+
224
+ @property
225
+ def blocking(self) -> bool:
226
+ """:return: whether the locking is blocking or not"""
227
+ return self._context.blocking
228
+
229
+ @blocking.setter
230
+ def blocking(self, value: bool) -> None:
231
+ """
232
+ Change the default blocking value.
233
+
234
+ :param value: the new value as bool
235
+
236
+ """
237
+ self._context.blocking = value
238
+
239
+ @property
240
+ def mode(self) -> int:
241
+ """:return: the file permissions for the lockfile"""
242
+ return self._context.mode
243
+
244
+ @abstractmethod
245
+ def _acquire(self) -> None:
246
+ """If the file lock could be acquired, self._context.lock_file_fd holds the file descriptor of the lock file."""
247
+ raise NotImplementedError
248
+
249
+ @abstractmethod
250
+ def _release(self) -> None:
251
+ """Releases the lock and sets self._context.lock_file_fd to None."""
252
+ raise NotImplementedError
253
+
254
+ @property
255
+ def is_locked(self) -> bool:
256
+ """
257
+
258
+ :return: A boolean indicating if the lock file is holding the lock currently.
259
+
260
+ .. versionchanged:: 2.0.0
261
+
262
+ This was previously a method and is now a property.
263
+ """
264
+ return self._context.lock_file_fd is not None
265
+
266
+ @property
267
+ def lock_counter(self) -> int:
268
+ """:return: The number of times this lock has been acquired (but not yet released)."""
269
+ return self._context.lock_counter
270
+
271
+ def acquire(
272
+ self,
273
+ timeout: float | None = None,
274
+ poll_interval: float = 0.05,
275
+ *,
276
+ poll_intervall: float | None = None,
277
+ blocking: bool | None = None,
278
+ ) -> AcquireReturnProxy:
279
+ """
280
+ Try to acquire the file lock.
281
+
282
+ :param timeout: maximum wait time for acquiring the lock, ``None`` means use the default :attr:`~timeout` is and
283
+ if ``timeout < 0``, there is no timeout and this method will block until the lock could be acquired
284
+ :param poll_interval: interval of trying to acquire the lock file
285
+ :param poll_intervall: deprecated, kept for backwards compatibility, use ``poll_interval`` instead
286
+ :param blocking: defaults to True. If False, function will return immediately if it cannot obtain a lock on the
287
+ first attempt. Otherwise, this method will block until the timeout expires or the lock is acquired.
288
+ :raises Timeout: if fails to acquire lock within the timeout period
289
+ :return: a context object that will unlock the file when the context is exited
290
+
291
+ .. code-block:: python
292
+
293
+ # You can use this method in the context manager (recommended)
294
+ with lock.acquire():
295
+ pass
296
+
297
+ # Or use an equivalent try-finally construct:
298
+ lock.acquire()
299
+ try:
300
+ pass
301
+ finally:
302
+ lock.release()
303
+
304
+ .. versionchanged:: 2.0.0
305
+
306
+ This method returns now a *proxy* object instead of *self*,
307
+ so that it can be used in a with statement without side effects.
308
+
309
+ """
310
+ # Use the default timeout, if no timeout is provided.
311
+ if timeout is None:
312
+ timeout = self._context.timeout
313
+
314
+ if blocking is None:
315
+ blocking = self._context.blocking
316
+
317
+ if poll_intervall is not None:
318
+ msg = "use poll_interval instead of poll_intervall"
319
+ warnings.warn(msg, DeprecationWarning, stacklevel=2)
320
+ poll_interval = poll_intervall
321
+
322
+ # Increment the number right at the beginning. We can still undo it, if something fails.
323
+ self._context.lock_counter += 1
324
+
325
+ lock_id = id(self)
326
+ lock_filename = self.lock_file
327
+ start_time = time.perf_counter()
328
+ try:
329
+ while True:
330
+ if not self.is_locked:
331
+ _LOGGER.debug("Attempting to acquire lock %s on %s", lock_id, lock_filename)
332
+ self._acquire()
333
+ if self.is_locked:
334
+ _LOGGER.debug("Lock %s acquired on %s", lock_id, lock_filename)
335
+ break
336
+ if blocking is False:
337
+ _LOGGER.debug("Failed to immediately acquire lock %s on %s", lock_id, lock_filename)
338
+ raise Timeout(lock_filename) # noqa: TRY301
339
+ if 0 <= timeout < time.perf_counter() - start_time:
340
+ _LOGGER.debug("Timeout on acquiring lock %s on %s", lock_id, lock_filename)
341
+ raise Timeout(lock_filename) # noqa: TRY301
342
+ msg = "Lock %s not acquired on %s, waiting %s seconds ..."
343
+ _LOGGER.debug(msg, lock_id, lock_filename, poll_interval)
344
+ time.sleep(poll_interval)
345
+ except BaseException: # Something did go wrong, so decrement the counter.
346
+ self._context.lock_counter = max(0, self._context.lock_counter - 1)
347
+ raise
348
+ return AcquireReturnProxy(lock=self)
349
+
350
+ def release(self, force: bool = False) -> None: # noqa: FBT001, FBT002
351
+ """
352
+ Releases the file lock. Please note, that the lock is only completely released, if the lock counter is 0.
353
+ Also note, that the lock file itself is not automatically deleted.
354
+
355
+ :param force: If true, the lock counter is ignored and the lock is released in every case/
356
+
357
+ """
358
+ if self.is_locked:
359
+ self._context.lock_counter -= 1
360
+
361
+ if self._context.lock_counter == 0 or force:
362
+ lock_id, lock_filename = id(self), self.lock_file
363
+
364
+ _LOGGER.debug("Attempting to release lock %s on %s", lock_id, lock_filename)
365
+ self._release()
366
+ self._context.lock_counter = 0
367
+ _LOGGER.debug("Lock %s released on %s", lock_id, lock_filename)
368
+
369
+ def __enter__(self) -> Self:
370
+ """
371
+ Acquire the lock.
372
+
373
+ :return: the lock object
374
+
375
+ """
376
+ self.acquire()
377
+ return self
378
+
379
+ def __exit__(
380
+ self,
381
+ exc_type: type[BaseException] | None,
382
+ exc_value: BaseException | None,
383
+ traceback: TracebackType | None,
384
+ ) -> None:
385
+ """
386
+ Release the lock.
387
+
388
+ :param exc_type: the exception type if raised
389
+ :param exc_value: the exception value if raised
390
+ :param traceback: the exception traceback if raised
391
+
392
+ """
393
+ self.release()
394
+
395
+ def __del__(self) -> None:
396
+ """Called when the lock object is deleted."""
397
+ self.release(force=True)
398
+
399
+
400
+ __all__ = [
401
+ "AcquireReturnProxy",
402
+ "BaseFileLock",
403
+ ]
meow/lib/python3.13/site-packages/filelock/_error.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+
6
+ class Timeout(TimeoutError): # noqa: N818
7
+ """Raised when the lock could not be acquired in *timeout* seconds."""
8
+
9
+ def __init__(self, lock_file: str) -> None:
10
+ super().__init__()
11
+ self._lock_file = lock_file
12
+
13
+ def __reduce__(self) -> str | tuple[Any, ...]:
14
+ return self.__class__, (self._lock_file,) # Properly pickle the exception
15
+
16
+ def __str__(self) -> str:
17
+ return f"The file lock '{self._lock_file}' could not be acquired."
18
+
19
+ def __repr__(self) -> str:
20
+ return f"{self.__class__.__name__}({self.lock_file!r})"
21
+
22
+ @property
23
+ def lock_file(self) -> str:
24
+ """:return: The path of the file lock."""
25
+ return self._lock_file
26
+
27
+
28
+ __all__ = [
29
+ "Timeout",
30
+ ]
meow/lib/python3.13/site-packages/filelock/_soft.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import EACCES, EEXIST
7
+ from pathlib import Path
8
+
9
+ from ._api import BaseFileLock
10
+ from ._util import ensure_directory_exists, raise_on_not_writable_file
11
+
12
+
13
+ class SoftFileLock(BaseFileLock):
14
+ """Simply watches the existence of the lock file."""
15
+
16
+ def _acquire(self) -> None:
17
+ raise_on_not_writable_file(self.lock_file)
18
+ ensure_directory_exists(self.lock_file)
19
+ # first check for exists and read-only mode as the open will mask this case as EEXIST
20
+ flags = (
21
+ os.O_WRONLY # open for writing only
22
+ | os.O_CREAT
23
+ | os.O_EXCL # together with above raise EEXIST if the file specified by filename exists
24
+ | os.O_TRUNC # truncate the file to zero byte
25
+ )
26
+ try:
27
+ file_handler = os.open(self.lock_file, flags, self._context.mode)
28
+ except OSError as exception: # re-raise unless expected exception
29
+ if not (
30
+ exception.errno == EEXIST # lock already exist
31
+ or (exception.errno == EACCES and sys.platform == "win32") # has no access to this lock
32
+ ): # pragma: win32 no cover
33
+ raise
34
+ else:
35
+ self._context.lock_file_fd = file_handler
36
+
37
+ def _release(self) -> None:
38
+ assert self._context.lock_file_fd is not None # noqa: S101
39
+ os.close(self._context.lock_file_fd) # the lock file is definitely not None
40
+ self._context.lock_file_fd = None
41
+ with suppress(OSError): # the file is already deleted and that's what we want
42
+ Path(self.lock_file).unlink()
43
+
44
+
45
+ __all__ = [
46
+ "SoftFileLock",
47
+ ]
meow/lib/python3.13/site-packages/filelock/_unix.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import ENOSYS
7
+ from pathlib import Path
8
+ from typing import cast
9
+
10
+ from ._api import BaseFileLock
11
+ from ._util import ensure_directory_exists
12
+
13
+ #: a flag to indicate if the fcntl API is available
14
+ has_fcntl = False
15
+ if sys.platform == "win32": # pragma: win32 cover
16
+
17
+ class UnixFileLock(BaseFileLock):
18
+ """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
19
+
20
+ def _acquire(self) -> None:
21
+ raise NotImplementedError
22
+
23
+ def _release(self) -> None:
24
+ raise NotImplementedError
25
+
26
+ else: # pragma: win32 no cover
27
+ try:
28
+ import fcntl
29
+ except ImportError:
30
+ pass
31
+ else:
32
+ has_fcntl = True
33
+
34
+ class UnixFileLock(BaseFileLock):
35
+ """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
36
+
37
+ def _acquire(self) -> None:
38
+ ensure_directory_exists(self.lock_file)
39
+ open_flags = os.O_RDWR | os.O_TRUNC
40
+ if not Path(self.lock_file).exists():
41
+ open_flags |= os.O_CREAT
42
+ fd = os.open(self.lock_file, open_flags, self._context.mode)
43
+ with suppress(PermissionError): # This locked is not owned by this UID
44
+ os.fchmod(fd, self._context.mode)
45
+ try:
46
+ fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
47
+ except OSError as exception:
48
+ os.close(fd)
49
+ if exception.errno == ENOSYS: # NotImplemented error
50
+ msg = "FileSystem does not appear to support flock; use SoftFileLock instead"
51
+ raise NotImplementedError(msg) from exception
52
+ else:
53
+ self._context.lock_file_fd = fd
54
+
55
+ def _release(self) -> None:
56
+ # Do not remove the lockfile:
57
+ # https://github.com/tox-dev/py-filelock/issues/31
58
+ # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
59
+ fd = cast(int, self._context.lock_file_fd)
60
+ self._context.lock_file_fd = None
61
+ fcntl.flock(fd, fcntl.LOCK_UN)
62
+ os.close(fd)
63
+
64
+
65
+ __all__ = [
66
+ "UnixFileLock",
67
+ "has_fcntl",
68
+ ]
meow/lib/python3.13/site-packages/filelock/_util.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import stat
5
+ import sys
6
+ from errno import EACCES, EISDIR
7
+ from pathlib import Path
8
+
9
+
10
+ def raise_on_not_writable_file(filename: str) -> None:
11
+ """
12
+ Raise an exception if attempting to open the file for writing would fail.
13
+
14
+ This is done so files that will never be writable can be separated from files that are writable but currently
15
+ locked.
16
+
17
+ :param filename: file to check
18
+ :raises OSError: as if the file was opened for writing.
19
+
20
+ """
21
+ try: # use stat to do exists + can write to check without race condition
22
+ file_stat = os.stat(filename) # noqa: PTH116
23
+ except OSError:
24
+ return # swallow does not exist or other errors
25
+
26
+ if file_stat.st_mtime != 0: # if os.stat returns but modification is zero that's an invalid os.stat - ignore it
27
+ if not (file_stat.st_mode & stat.S_IWUSR):
28
+ raise PermissionError(EACCES, "Permission denied", filename)
29
+
30
+ if stat.S_ISDIR(file_stat.st_mode):
31
+ if sys.platform == "win32": # pragma: win32 cover
32
+ # On Windows, this is PermissionError
33
+ raise PermissionError(EACCES, "Permission denied", filename)
34
+ else: # pragma: win32 no cover # noqa: RET506
35
+ # On linux / macOS, this is IsADirectoryError
36
+ raise IsADirectoryError(EISDIR, "Is a directory", filename)
37
+
38
+
39
+ def ensure_directory_exists(filename: Path | str) -> None:
40
+ """
41
+ Ensure the directory containing the file exists (create it if necessary).
42
+
43
+ :param filename: file.
44
+
45
+ """
46
+ Path(filename).parent.mkdir(parents=True, exist_ok=True)
47
+
48
+
49
+ __all__ = [
50
+ "ensure_directory_exists",
51
+ "raise_on_not_writable_file",
52
+ ]
meow/lib/python3.13/site-packages/filelock/_windows.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import EACCES
7
+ from pathlib import Path
8
+ from typing import cast
9
+
10
+ from ._api import BaseFileLock
11
+ from ._util import ensure_directory_exists, raise_on_not_writable_file
12
+
13
+ if sys.platform == "win32": # pragma: win32 cover
14
+ import msvcrt
15
+
16
+ class WindowsFileLock(BaseFileLock):
17
+ """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
18
+
19
+ def _acquire(self) -> None:
20
+ raise_on_not_writable_file(self.lock_file)
21
+ ensure_directory_exists(self.lock_file)
22
+ flags = (
23
+ os.O_RDWR # open for read and write
24
+ | os.O_CREAT # create file if not exists
25
+ | os.O_TRUNC # truncate file if not empty
26
+ )
27
+ try:
28
+ fd = os.open(self.lock_file, flags, self._context.mode)
29
+ except OSError as exception:
30
+ if exception.errno != EACCES: # has no access to this lock
31
+ raise
32
+ else:
33
+ try:
34
+ msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
35
+ except OSError as exception:
36
+ os.close(fd) # close file first
37
+ if exception.errno != EACCES: # file is already locked
38
+ raise
39
+ else:
40
+ self._context.lock_file_fd = fd
41
+
42
+ def _release(self) -> None:
43
+ fd = cast(int, self._context.lock_file_fd)
44
+ self._context.lock_file_fd = None
45
+ msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
46
+ os.close(fd)
47
+
48
+ with suppress(OSError): # Probably another instance of the application hat acquired the file lock.
49
+ Path(self.lock_file).unlink()
50
+
51
+ else: # pragma: win32 no cover
52
+
53
+ class WindowsFileLock(BaseFileLock):
54
+ """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
55
+
56
+ def _acquire(self) -> None:
57
+ raise NotImplementedError
58
+
59
+ def _release(self) -> None:
60
+ raise NotImplementedError
61
+
62
+
63
+ __all__ = [
64
+ "WindowsFileLock",
65
+ ]
meow/lib/python3.13/site-packages/filelock/asyncio.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """An asyncio-based implementation of the file lock.""" # noqa: A005
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextlib
7
+ import logging
8
+ import os
9
+ import time
10
+ from dataclasses import dataclass
11
+ from threading import local
12
+ from typing import TYPE_CHECKING, Any, Callable, NoReturn, cast
13
+
14
+ from ._api import BaseFileLock, FileLockContext, FileLockMeta
15
+ from ._error import Timeout
16
+ from ._soft import SoftFileLock
17
+ from ._unix import UnixFileLock
18
+ from ._windows import WindowsFileLock
19
+
20
+ if TYPE_CHECKING:
21
+ import sys
22
+ from concurrent import futures
23
+ from types import TracebackType
24
+
25
+ if sys.version_info >= (3, 11): # pragma: no cover (py311+)
26
+ from typing import Self
27
+ else: # pragma: no cover (<py311)
28
+ from typing_extensions import Self
29
+
30
+
31
+ _LOGGER = logging.getLogger("filelock")
32
+
33
+
34
+ @dataclass
35
+ class AsyncFileLockContext(FileLockContext):
36
+ """A dataclass which holds the context for a ``BaseAsyncFileLock`` object."""
37
+
38
+ #: Whether run in executor
39
+ run_in_executor: bool = True
40
+
41
+ #: The executor
42
+ executor: futures.Executor | None = None
43
+
44
+ #: The loop
45
+ loop: asyncio.AbstractEventLoop | None = None
46
+
47
+
48
+ class AsyncThreadLocalFileContext(AsyncFileLockContext, local):
49
+ """A thread local version of the ``FileLockContext`` class."""
50
+
51
+
52
+ class AsyncAcquireReturnProxy:
53
+ """A context-aware object that will release the lock file when exiting."""
54
+
55
+ def __init__(self, lock: BaseAsyncFileLock) -> None: # noqa: D107
56
+ self.lock = lock
57
+
58
+ async def __aenter__(self) -> BaseAsyncFileLock: # noqa: D105
59
+ return self.lock
60
+
61
+ async def __aexit__( # noqa: D105
62
+ self,
63
+ exc_type: type[BaseException] | None,
64
+ exc_value: BaseException | None,
65
+ traceback: TracebackType | None,
66
+ ) -> None:
67
+ await self.lock.release()
68
+
69
+
70
+ class AsyncFileLockMeta(FileLockMeta):
71
+ def __call__( # type: ignore[override] # noqa: PLR0913
72
+ cls, # noqa: N805
73
+ lock_file: str | os.PathLike[str],
74
+ timeout: float = -1,
75
+ mode: int = 0o644,
76
+ thread_local: bool = False, # noqa: FBT001, FBT002
77
+ *,
78
+ blocking: bool = True,
79
+ is_singleton: bool = False,
80
+ loop: asyncio.AbstractEventLoop | None = None,
81
+ run_in_executor: bool = True,
82
+ executor: futures.Executor | None = None,
83
+ ) -> BaseAsyncFileLock:
84
+ if thread_local and run_in_executor:
85
+ msg = "run_in_executor is not supported when thread_local is True"
86
+ raise ValueError(msg)
87
+ instance = super().__call__(
88
+ lock_file=lock_file,
89
+ timeout=timeout,
90
+ mode=mode,
91
+ thread_local=thread_local,
92
+ blocking=blocking,
93
+ is_singleton=is_singleton,
94
+ loop=loop,
95
+ run_in_executor=run_in_executor,
96
+ executor=executor,
97
+ )
98
+ return cast(BaseAsyncFileLock, instance)
99
+
100
+
101
+ class BaseAsyncFileLock(BaseFileLock, metaclass=AsyncFileLockMeta):
102
+ """Base class for asynchronous file locks."""
103
+
104
+ def __init__( # noqa: PLR0913
105
+ self,
106
+ lock_file: str | os.PathLike[str],
107
+ timeout: float = -1,
108
+ mode: int = 0o644,
109
+ thread_local: bool = False, # noqa: FBT001, FBT002
110
+ *,
111
+ blocking: bool = True,
112
+ is_singleton: bool = False,
113
+ loop: asyncio.AbstractEventLoop | None = None,
114
+ run_in_executor: bool = True,
115
+ executor: futures.Executor | None = None,
116
+ ) -> None:
117
+ """
118
+ Create a new lock object.
119
+
120
+ :param lock_file: path to the file
121
+ :param timeout: default timeout when acquiring the lock, in seconds. It will be used as fallback value in \
122
+ the acquire method, if no timeout value (``None``) is given. If you want to disable the timeout, set it \
123
+ to a negative value. A timeout of 0 means that there is exactly one attempt to acquire the file lock.
124
+ :param mode: file permissions for the lockfile
125
+ :param thread_local: Whether this object's internal context should be thread local or not. If this is set to \
126
+ ``False`` then the lock will be reentrant across threads.
127
+ :param blocking: whether the lock should be blocking or not
128
+ :param is_singleton: If this is set to ``True`` then only one instance of this class will be created \
129
+ per lock file. This is useful if you want to use the lock object for reentrant locking without needing \
130
+ to pass the same object around.
131
+ :param loop: The event loop to use. If not specified, the running event loop will be used.
132
+ :param run_in_executor: If this is set to ``True`` then the lock will be acquired in an executor.
133
+ :param executor: The executor to use. If not specified, the default executor will be used.
134
+
135
+ """
136
+ self._is_thread_local = thread_local
137
+ self._is_singleton = is_singleton
138
+
139
+ # Create the context. Note that external code should not work with the context directly and should instead use
140
+ # properties of this class.
141
+ kwargs: dict[str, Any] = {
142
+ "lock_file": os.fspath(lock_file),
143
+ "timeout": timeout,
144
+ "mode": mode,
145
+ "blocking": blocking,
146
+ "loop": loop,
147
+ "run_in_executor": run_in_executor,
148
+ "executor": executor,
149
+ }
150
+ self._context: AsyncFileLockContext = (AsyncThreadLocalFileContext if thread_local else AsyncFileLockContext)(
151
+ **kwargs
152
+ )
153
+
154
+ @property
155
+ def run_in_executor(self) -> bool:
156
+ """::return: whether run in executor."""
157
+ return self._context.run_in_executor
158
+
159
+ @property
160
+ def executor(self) -> futures.Executor | None:
161
+ """::return: the executor."""
162
+ return self._context.executor
163
+
164
+ @executor.setter
165
+ def executor(self, value: futures.Executor | None) -> None: # pragma: no cover
166
+ """
167
+ Change the executor.
168
+
169
+ :param value: the new executor or ``None``
170
+ :type value: futures.Executor | None
171
+
172
+ """
173
+ self._context.executor = value
174
+
175
+ @property
176
+ def loop(self) -> asyncio.AbstractEventLoop | None:
177
+ """::return: the event loop."""
178
+ return self._context.loop
179
+
180
+ async def acquire( # type: ignore[override]
181
+ self,
182
+ timeout: float | None = None,
183
+ poll_interval: float = 0.05,
184
+ *,
185
+ blocking: bool | None = None,
186
+ ) -> AsyncAcquireReturnProxy:
187
+ """
188
+ Try to acquire the file lock.
189
+
190
+ :param timeout: maximum wait time for acquiring the lock, ``None`` means use the default
191
+ :attr:`~BaseFileLock.timeout` is and if ``timeout < 0``, there is no timeout and
192
+ this method will block until the lock could be acquired
193
+ :param poll_interval: interval of trying to acquire the lock file
194
+ :param blocking: defaults to True. If False, function will return immediately if it cannot obtain a lock on the
195
+ first attempt. Otherwise, this method will block until the timeout expires or the lock is acquired.
196
+ :raises Timeout: if fails to acquire lock within the timeout period
197
+ :return: a context object that will unlock the file when the context is exited
198
+
199
+ .. code-block:: python
200
+
201
+ # You can use this method in the context manager (recommended)
202
+ with lock.acquire():
203
+ pass
204
+
205
+ # Or use an equivalent try-finally construct:
206
+ lock.acquire()
207
+ try:
208
+ pass
209
+ finally:
210
+ lock.release()
211
+
212
+ """
213
+ # Use the default timeout, if no timeout is provided.
214
+ if timeout is None:
215
+ timeout = self._context.timeout
216
+
217
+ if blocking is None:
218
+ blocking = self._context.blocking
219
+
220
+ # Increment the number right at the beginning. We can still undo it, if something fails.
221
+ self._context.lock_counter += 1
222
+
223
+ lock_id = id(self)
224
+ lock_filename = self.lock_file
225
+ start_time = time.perf_counter()
226
+ try:
227
+ while True:
228
+ if not self.is_locked:
229
+ _LOGGER.debug("Attempting to acquire lock %s on %s", lock_id, lock_filename)
230
+ await self._run_internal_method(self._acquire)
231
+ if self.is_locked:
232
+ _LOGGER.debug("Lock %s acquired on %s", lock_id, lock_filename)
233
+ break
234
+ if blocking is False:
235
+ _LOGGER.debug("Failed to immediately acquire lock %s on %s", lock_id, lock_filename)
236
+ raise Timeout(lock_filename) # noqa: TRY301
237
+ if 0 <= timeout < time.perf_counter() - start_time:
238
+ _LOGGER.debug("Timeout on acquiring lock %s on %s", lock_id, lock_filename)
239
+ raise Timeout(lock_filename) # noqa: TRY301
240
+ msg = "Lock %s not acquired on %s, waiting %s seconds ..."
241
+ _LOGGER.debug(msg, lock_id, lock_filename, poll_interval)
242
+ await asyncio.sleep(poll_interval)
243
+ except BaseException: # Something did go wrong, so decrement the counter.
244
+ self._context.lock_counter = max(0, self._context.lock_counter - 1)
245
+ raise
246
+ return AsyncAcquireReturnProxy(lock=self)
247
+
248
+ async def release(self, force: bool = False) -> None: # type: ignore[override] # noqa: FBT001, FBT002
249
+ """
250
+ Releases the file lock. Please note, that the lock is only completely released, if the lock counter is 0.
251
+ Also note, that the lock file itself is not automatically deleted.
252
+
253
+ :param force: If true, the lock counter is ignored and the lock is released in every case/
254
+
255
+ """
256
+ if self.is_locked:
257
+ self._context.lock_counter -= 1
258
+
259
+ if self._context.lock_counter == 0 or force:
260
+ lock_id, lock_filename = id(self), self.lock_file
261
+
262
+ _LOGGER.debug("Attempting to release lock %s on %s", lock_id, lock_filename)
263
+ await self._run_internal_method(self._release)
264
+ self._context.lock_counter = 0
265
+ _LOGGER.debug("Lock %s released on %s", lock_id, lock_filename)
266
+
267
+ async def _run_internal_method(self, method: Callable[[], Any]) -> None:
268
+ if asyncio.iscoroutinefunction(method):
269
+ await method()
270
+ elif self.run_in_executor:
271
+ loop = self.loop or asyncio.get_running_loop()
272
+ await loop.run_in_executor(self.executor, method)
273
+ else:
274
+ method()
275
+
276
+ def __enter__(self) -> NoReturn:
277
+ """
278
+ Replace old __enter__ method to avoid using it.
279
+
280
+ NOTE: DO NOT USE `with` FOR ASYNCIO LOCKS, USE `async with` INSTEAD.
281
+
282
+ :return: none
283
+ :rtype: NoReturn
284
+ """
285
+ msg = "Do not use `with` for asyncio locks, use `async with` instead."
286
+ raise NotImplementedError(msg)
287
+
288
+ async def __aenter__(self) -> Self:
289
+ """
290
+ Acquire the lock.
291
+
292
+ :return: the lock object
293
+
294
+ """
295
+ await self.acquire()
296
+ return self
297
+
298
+ async def __aexit__(
299
+ self,
300
+ exc_type: type[BaseException] | None,
301
+ exc_value: BaseException | None,
302
+ traceback: TracebackType | None,
303
+ ) -> None:
304
+ """
305
+ Release the lock.
306
+
307
+ :param exc_type: the exception type if raised
308
+ :param exc_value: the exception value if raised
309
+ :param traceback: the exception traceback if raised
310
+
311
+ """
312
+ await self.release()
313
+
314
+ def __del__(self) -> None:
315
+ """Called when the lock object is deleted."""
316
+ with contextlib.suppress(RuntimeError):
317
+ loop = self.loop or asyncio.get_running_loop()
318
+ if not loop.is_running(): # pragma: no cover
319
+ loop.run_until_complete(self.release(force=True))
320
+ else:
321
+ loop.create_task(self.release(force=True))
322
+
323
+
324
+ class AsyncSoftFileLock(SoftFileLock, BaseAsyncFileLock):
325
+ """Simply watches the existence of the lock file."""
326
+
327
+
328
+ class AsyncUnixFileLock(UnixFileLock, BaseAsyncFileLock):
329
+ """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
330
+
331
+
332
+ class AsyncWindowsFileLock(WindowsFileLock, BaseAsyncFileLock):
333
+ """Uses the :func:`msvcrt.locking` to hard lock the lock file on windows systems."""
334
+
335
+
336
+ __all__ = [
337
+ "AsyncAcquireReturnProxy",
338
+ "AsyncSoftFileLock",
339
+ "AsyncUnixFileLock",
340
+ "AsyncWindowsFileLock",
341
+ "BaseAsyncFileLock",
342
+ ]
meow/lib/python3.13/site-packages/filelock/py.typed ADDED
File without changes
meow/lib/python3.13/site-packages/filelock/version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '3.16.1'
16
+ __version_tuple__ = version_tuple = (3, 16, 1)
meow/lib/python3.13/site-packages/fsspec/__init__.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib.metadata import entry_points
2
+
3
+ from . import caching
4
+ from ._version import __version__ # noqa: F401
5
+ from .callbacks import Callback
6
+ from .compression import available_compressions
7
+ from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
8
+ from .exceptions import FSTimeoutError
9
+ from .mapping import FSMap, get_mapper
10
+ from .registry import (
11
+ available_protocols,
12
+ filesystem,
13
+ get_filesystem_class,
14
+ register_implementation,
15
+ registry,
16
+ )
17
+ from .spec import AbstractFileSystem
18
+
19
+ __all__ = [
20
+ "AbstractFileSystem",
21
+ "FSTimeoutError",
22
+ "FSMap",
23
+ "filesystem",
24
+ "register_implementation",
25
+ "get_filesystem_class",
26
+ "get_fs_token_paths",
27
+ "get_mapper",
28
+ "open",
29
+ "open_files",
30
+ "open_local",
31
+ "registry",
32
+ "caching",
33
+ "Callback",
34
+ "available_protocols",
35
+ "available_compressions",
36
+ "url_to_fs",
37
+ ]
38
+
39
+
40
+ def process_entries():
41
+ if entry_points is not None:
42
+ try:
43
+ eps = entry_points()
44
+ except TypeError:
45
+ pass # importlib-metadata < 0.8
46
+ else:
47
+ if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0
48
+ specs = eps.select(group="fsspec.specs")
49
+ else:
50
+ specs = eps.get("fsspec.specs", [])
51
+ registered_names = {}
52
+ for spec in specs:
53
+ err_msg = f"Unable to load filesystem from {spec}"
54
+ name = spec.name
55
+ if name in registered_names:
56
+ continue
57
+ registered_names[name] = True
58
+ register_implementation(
59
+ name,
60
+ spec.value.replace(":", "."),
61
+ errtxt=err_msg,
62
+ # We take our implementations as the ones to overload with if
63
+ # for some reason we encounter some, may be the same, already
64
+ # registered
65
+ clobber=True,
66
+ )
67
+
68
+
69
+ process_entries()
meow/lib/python3.13/site-packages/fsspec/_version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '2024.12.0'
16
+ __version_tuple__ = version_tuple = (2024, 12, 0)
meow/lib/python3.13/site-packages/fsspec/archive.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fsspec import AbstractFileSystem
2
+ from fsspec.utils import tokenize
3
+
4
+
5
+ class AbstractArchiveFileSystem(AbstractFileSystem):
6
+ """
7
+ A generic superclass for implementing Archive-based filesystems.
8
+
9
+ Currently, it is shared amongst
10
+ :class:`~fsspec.implementations.zip.ZipFileSystem`,
11
+ :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
12
+ :class:`~fsspec.implementations.tar.TarFileSystem`.
13
+ """
14
+
15
+ def __str__(self):
16
+ return f"<Archive-like object {type(self).__name__} at {id(self)}>"
17
+
18
+ __repr__ = __str__
19
+
20
+ def ukey(self, path):
21
+ return tokenize(path, self.fo, self.protocol)
22
+
23
+ def _all_dirnames(self, paths):
24
+ """Returns *all* directory names for each path in paths, including intermediate
25
+ ones.
26
+
27
+ Parameters
28
+ ----------
29
+ paths: Iterable of path strings
30
+ """
31
+ if len(paths) == 0:
32
+ return set()
33
+
34
+ dirnames = {self._parent(path) for path in paths} - {self.root_marker}
35
+ return dirnames | self._all_dirnames(dirnames)
36
+
37
+ def info(self, path, **kwargs):
38
+ self._get_dirs()
39
+ path = self._strip_protocol(path)
40
+ if path in {"", "/"} and self.dir_cache:
41
+ return {"name": "", "type": "directory", "size": 0}
42
+ if path in self.dir_cache:
43
+ return self.dir_cache[path]
44
+ elif path + "/" in self.dir_cache:
45
+ return self.dir_cache[path + "/"]
46
+ else:
47
+ raise FileNotFoundError(path)
48
+
49
+ def ls(self, path, detail=True, **kwargs):
50
+ self._get_dirs()
51
+ paths = {}
52
+ for p, f in self.dir_cache.items():
53
+ p = p.rstrip("/")
54
+ if "/" in p:
55
+ root = p.rsplit("/", 1)[0]
56
+ else:
57
+ root = ""
58
+ if root == path.rstrip("/"):
59
+ paths[p] = f
60
+ elif all(
61
+ (a == b)
62
+ for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
63
+ ):
64
+ # root directory entry
65
+ ppath = p.rstrip("/").split("/", 1)[0]
66
+ if ppath not in paths:
67
+ out = {"name": ppath, "size": 0, "type": "directory"}
68
+ paths[ppath] = out
69
+ if detail:
70
+ out = sorted(paths.values(), key=lambda _: _["name"])
71
+ return out
72
+ else:
73
+ return sorted(paths)
meow/lib/python3.13/site-packages/fsspec/asyn.py ADDED
@@ -0,0 +1,1098 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import asyncio.events
3
+ import functools
4
+ import inspect
5
+ import io
6
+ import numbers
7
+ import os
8
+ import re
9
+ import threading
10
+ from contextlib import contextmanager
11
+ from glob import has_magic
12
+ from typing import TYPE_CHECKING, Iterable
13
+
14
+ from .callbacks import DEFAULT_CALLBACK
15
+ from .exceptions import FSTimeoutError
16
+ from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
17
+ from .spec import AbstractBufferedFile, AbstractFileSystem
18
+ from .utils import glob_translate, is_exception, other_paths
19
+
20
+ private = re.compile("_[^_]")
21
+ iothread = [None] # dedicated fsspec IO thread
22
+ loop = [None] # global event loop for any non-async instance
23
+ _lock = None # global lock placeholder
24
+ get_running_loop = asyncio.get_running_loop
25
+
26
+
27
+ def get_lock():
28
+ """Allocate or return a threading lock.
29
+
30
+ The lock is allocated on first use to allow setting one lock per forked process.
31
+ """
32
+ global _lock
33
+ if not _lock:
34
+ _lock = threading.Lock()
35
+ return _lock
36
+
37
+
38
+ def reset_lock():
39
+ """Reset the global lock.
40
+
41
+ This should be called only on the init of a forked process to reset the lock to
42
+ None, enabling the new forked process to get a new lock.
43
+ """
44
+ global _lock
45
+
46
+ iothread[0] = None
47
+ loop[0] = None
48
+ _lock = None
49
+
50
+
51
+ async def _runner(event, coro, result, timeout=None):
52
+ timeout = timeout if timeout else None # convert 0 or 0.0 to None
53
+ if timeout is not None:
54
+ coro = asyncio.wait_for(coro, timeout=timeout)
55
+ try:
56
+ result[0] = await coro
57
+ except Exception as ex:
58
+ result[0] = ex
59
+ finally:
60
+ event.set()
61
+
62
+
63
def sync(loop, func, *args, timeout=None, **kwargs):
    """
    Make loop run coroutine until it returns. Runs in other thread

    Examples
    --------
    >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args,
                         timeout=timeout, **kwargs)
    """
    timeout = timeout if timeout else None  # convert 0 or 0.0 to None
    # NB: if the loop is not running *yet*, it is OK to submit work
    # and we will wait for it
    if loop is None or loop.is_closed():
        raise RuntimeError("Loop is not running")
    try:
        # Reentrancy guard: calling sync() from a coroutine that is itself
        # running on the target loop would deadlock, so refuse explicitly.
        loop0 = asyncio.events.get_running_loop()
        if loop0 is loop:
            raise NotImplementedError("Calling sync() from within a running loop")
    except NotImplementedError:
        raise
    except RuntimeError:
        # no loop running in this thread -- the normal, safe case
        pass
    coro = func(*args, **kwargs)
    result = [None]
    event = threading.Event()
    # _runner stores the outcome in result[0] and sets the event when done
    asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
    while True:
        # this loops allows thread to get interrupted
        if event.wait(1):
            break
        if timeout is not None:
            # coarse 1-second countdown mirroring the event.wait(1) interval
            timeout -= 1
            if timeout < 0:
                raise FSTimeoutError

    return_result = result[0]
    if isinstance(return_result, asyncio.TimeoutError):
        # suppress asyncio.TimeoutError, raise FSTimeoutError
        raise FSTimeoutError from return_result
    elif isinstance(return_result, BaseException):
        # re-raise the exception captured by _runner in the caller's thread
        raise return_result
    else:
        return return_result
106
+
107
+
108
def sync_wrapper(func, obj=None):
    """Wrap a coroutine function so it can be called from blocking code.

    Leave obj=None if defining within a class. Pass the instance if attaching
    as an attribute of the instance.  The returned wrapper submits the
    coroutine to ``target.loop`` via :func:`sync` and blocks until done.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # when not bound at wrap time, the first positional arg is "self"
        target = obj or args[0]
        return sync(target.loop, func, *args, **kwargs)

    return wrapper
121
+
122
+
123
+ @contextmanager
124
+ def _selector_policy():
125
+ original_policy = asyncio.get_event_loop_policy()
126
+ try:
127
+ if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
128
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
129
+
130
+ yield
131
+ finally:
132
+ asyncio.set_event_loop_policy(original_policy)
133
+
134
+
135
def get_loop():
    """Create or return the default fsspec IO loop

    The loop will be running on a separate thread.
    """
    # Double-checked locking: cheap unlocked test first, then re-test under
    # the lock before actually creating the loop.
    if loop[0] is None:
        with get_lock():
            # repeat the check just in case the loop got filled between the
            # previous two calls from another thread
            if loop[0] is None:
                with _selector_policy():
                    loop[0] = asyncio.new_event_loop()
                th = threading.Thread(target=loop[0].run_forever, name="fsspecIO")
                # daemon thread so the IO loop never blocks interpreter exit
                th.daemon = True
                th.start()
                iothread[0] = th
    return loop[0]
152
+
153
+
154
if TYPE_CHECKING:
    import resource

    ResourceError = resource.error
else:
    try:
        import resource
    except ImportError:
        # "resource" is POSIX-only; mark it unavailable on this platform
        resource = None
        ResourceError = OSError
    else:
        # some builds may lack resource.error; fall back to OSError
        ResourceError = getattr(resource, "error", OSError)

# fallback concurrency when the open-files limit cannot be queried
_DEFAULT_BATCH_SIZE = 128
# larger default for batches that do not hold local file handles open
_NOFILES_DEFAULT_BATCH_SIZE = 1280
169
+
170
+
171
def _get_batch_size(nofiles=False):
    """Pick the default batch size for concurrent coroutine execution.

    Configuration (``fsspec.config.conf``) wins if the relevant key is set;
    otherwise fall back to built-in defaults, derived for the file-holding
    case from the process's soft open-files limit (1/8th of it), or -1
    (unlimited) when that limit is infinite.
    """
    from fsspec.config import conf

    # config override, keyed by whether local file handles are involved
    conf_key = "nofiles_gather_batch_size" if nofiles else "gather_batch_size"
    if conf_key in conf:
        return conf[conf_key]

    if nofiles:
        return _NOFILES_DEFAULT_BATCH_SIZE
    if resource is None:
        # platform without the resource module: use the static default
        return _DEFAULT_BATCH_SIZE

    try:
        soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ImportError, ValueError, ResourceError):
        return _DEFAULT_BATCH_SIZE

    # -1 signals "no throttling" to _run_coros_in_chunks
    return -1 if soft_limit == resource.RLIM_INFINITY else soft_limit // 8
194
+
195
+
196
def running_async() -> bool:
    """Being executed by an event loop?"""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # no loop is driving the current thread
        return False
    return True
203
+
204
+
205
async def _run_coros_in_chunks(
    coros,
    batch_size=None,
    callback=DEFAULT_CALLBACK,
    timeout=None,
    return_exceptions=False,
    nofiles=False,
):
    """Run the given coroutines in chunks.

    Parameters
    ----------
    coros: list of coroutines to run
    batch_size: int or None
        Number of coroutines to submit/wait on simultaneously.
        If -1, then it will not be any throttling. If
        None, it will be inferred from _get_batch_size()
    callback: fsspec.callbacks.Callback instance
        Gets a relative_update when each coroutine completes
    timeout: number or None
        If given, each coroutine times out after this time. Note that, since
        there are multiple batches, the total run time of this function will in
        general be longer
    return_exceptions: bool
        Same meaning as in asyncio.gather
    nofiles: bool
        If inferring the batch_size, does this operation involve local files?
        If yes, you normally expect smaller batches.

    Returns
    -------
    list of results in the same order as ``coros``; when
    ``return_exceptions`` is True, exceptions appear in place of results.
    """

    if batch_size is None:
        batch_size = _get_batch_size(nofiles=nofiles)

    # An empty submission must short-circuit here: with batch_size == -1 it
    # would otherwise become len(coros) == 0 below and trip the positivity
    # assertion instead of simply returning an empty result list.
    if not coros:
        return []

    if batch_size == -1:
        batch_size = len(coros)

    assert batch_size > 0

    async def _run_coro(coro, i):
        # run one coroutine, tagging the outcome with its original index
        try:
            return await asyncio.wait_for(coro, timeout=timeout), i
        except Exception as e:
            if not return_exceptions:
                raise
            return e, i
        finally:
            # tick the progress callback whether we succeeded or failed
            callback.relative_update(1)

    i = 0
    n = len(coros)
    results = [None] * n
    pending = set()

    while pending or i < n:
        # top up the in-flight set to at most batch_size tasks
        while len(pending) < batch_size and i < n:
            pending.add(asyncio.ensure_future(_run_coro(coros[i], i)))
            i += 1

        if not pending:
            break

        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        while done:
            # tasks here are finished; awaiting just unwraps value/exception
            result, k = await done.pop()
            results[k] = result

    return results
272
+
273
+
274
# these methods should be implemented as async by any async-able backend
# (mirror_sync_methods below generates the blocking counterparts from them)
async_methods = [
    "_ls",
    "_cat_file",
    "_get_file",
    "_put_file",
    "_rm_file",
    "_cp_file",
    "_pipe_file",
    "_expand_path",
    "_info",
    "_isfile",
    "_isdir",
    "_exists",
    "_walk",
    "_glob",
    "_find",
    "_du",
    "_size",
    "_mkdir",
    "_makedirs",
]
296
+
297
+
298
class AsyncFileSystem(AbstractFileSystem):
    """Async file operations, default implementations

    Passes bulk operations to asyncio.gather for concurrent operation.

    Implementations that have concurrent batch operations and/or async methods
    should inherit from this class instead of AbstractFileSystem. Docstrings are
    copied from the un-underscored method in AbstractFileSystem, if not given.
    """

    # note that methods do not have docstring here; they will be copied
    # for _* methods and inferred for overridden methods.

    async_impl = True
    mirror_sync_methods = True
    disable_throttling = False

    def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
        # asynchronous=True means the caller awaits the _-prefixed coroutines
        # directly, so no dedicated IO loop is attached to the instance
        self.asynchronous = asynchronous
        self._pid = os.getpid()  # remembered to detect use after fork
        if not asynchronous:
            self._loop = loop or get_loop()
        else:
            self._loop = None
        self.batch_size = batch_size
        super().__init__(*args, **kwargs)

    @property
    def loop(self):
        # refuse to hand out a loop inherited from the parent process
        if self._pid != os.getpid():
            raise RuntimeError("This class is not fork-safe")
        return self._loop

    async def _rm_file(self, path, **kwargs):
        raise NotImplementedError

    async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
        # TODO: implement on_error
        batch_size = batch_size or self.batch_size
        path = await self._expand_path(path, recursive=recursive)
        # _expand_path returns sorted paths; reversed() deletes deeper
        # entries before their containing directories
        return await _run_coros_in_chunks(
            [self._rm_file(p, **kwargs) for p in reversed(path)],
            batch_size=batch_size,
            nofiles=True,
        )

    async def _cp_file(self, path1, path2, **kwargs):
        raise NotImplementedError

    async def _mv_file(self, path1, path2):
        # move = copy then delete source
        await self._cp_file(path1, path2)
        await self._rm_file(path1)

    async def _copy(
        self,
        path1,
        path2,
        recursive=False,
        on_error=None,
        maxdepth=None,
        batch_size=None,
        **kwargs,
    ):
        # default error policy: tolerate missing files for recursive copies
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            source_is_str = isinstance(path1, str)
            paths1 = await self._expand_path(
                path1, maxdepth=maxdepth, recursive=recursive
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [
                    p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or await self._isdir(path2)
            )

            # whether destination paths should be derived as children of path2
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        batch_size = batch_size or self.batch_size
        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
        result = await _run_coros_in_chunks(
            coros, batch_size=batch_size, return_exceptions=True, nofiles=True
        )

        # surface the first non-ignorable exception, if any
        for ex in filter(is_exception, result):
            if on_error == "ignore" and isinstance(ex, FileNotFoundError):
                continue
            raise ex

    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
        raise NotImplementedError

    async def _pipe(self, path, value=None, batch_size=None, **kwargs):
        # accept either a single (path, value) pair or a {path: value} mapping
        if isinstance(path, str):
            path = {path: value}
        batch_size = batch_size or self.batch_size
        return await _run_coros_in_chunks(
            [self._pipe_file(k, v, **kwargs) for k, v in path.items()],
            batch_size=batch_size,
            nofiles=True,
        )

    async def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file"""
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""
                suff = True
            else:
                # negative start with explicit end: resolve against file size
                size = size or (await self._info(url))["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or (await self._info(url))["size"]
                    end = size + end
            elif end is None:
                end = ""
            if isinstance(end, numbers.Integral):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"

    async def _cat_file(self, path, start=None, end=None, **kwargs):
        raise NotImplementedError

    async def _cat(
        self, path, recursive=False, on_error="raise", batch_size=None, **kwargs
    ):
        paths = await self._expand_path(path, recursive=recursive)
        coros = [self._cat_file(path, **kwargs) for path in paths]
        batch_size = batch_size or self.batch_size
        out = await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )
        if on_error == "raise":
            ex = next(filter(is_exception, out), False)
            if ex:
                raise ex
        if (
            len(paths) > 1
            or isinstance(path, list)
            or paths[0] != self._strip_protocol(path)
        ):
            # multiple results: return a {path: bytes} mapping, possibly
            # omitting failures depending on on_error
            return {
                k: v
                for k, v in zip(paths, out)
                if on_error != "omit" or not is_exception(v)
            }
        else:
            # single literal path: return the bytes directly
            return out[0]

    async def _cat_ranges(
        self,
        paths,
        starts,
        ends,
        max_gap=None,
        batch_size=None,
        on_error="return",
        **kwargs,
    ):
        """Get the contents of byte ranges from one or more files

        Parameters
        ----------
        paths: list
            A list of filepaths on this filesystems
        starts, ends: int or list
            Bytes limits of the read. If using a single int, the same value will be
            used to read all the specified files.
        """
        # TODO: on_error
        if max_gap is not None:
            # use utils.merge_offset_ranges
            raise NotImplementedError
        if not isinstance(paths, list):
            raise TypeError
        # broadcast scalar starts/ends over all paths
        if not isinstance(starts, Iterable):
            starts = [starts] * len(paths)
        if not isinstance(ends, Iterable):
            ends = [ends] * len(paths)
        if len(starts) != len(paths) or len(ends) != len(paths):
            raise ValueError
        coros = [
            self._cat_file(p, start=s, end=e, **kwargs)
            for p, s, e in zip(paths, starts, ends)
        ]
        batch_size = batch_size or self.batch_size
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )

    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
        raise NotImplementedError

    async def _put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        batch_size=None,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        The put_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or await self._isdir(rpath)
            )

            rpath = self._strip_protocol(rpath)
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # split directory targets (created up front) from actual file copies
        is_dir = {l: os.path.isdir(l) for l in lpaths}
        rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
        file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]]

        await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs])
        batch_size = batch_size or self.batch_size

        coros = []
        callback.set_size(len(file_pairs))
        for lfile, rfile in file_pairs:
            # branch_coro gives each transfer its own child callback
            put_file = callback.branch_coro(self._put_file)
            coros.append(put_file(lfile, rfile, **kwargs))

        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )

    async def _get_file(self, rpath, lpath, **kwargs):
        raise NotImplementedError

    async def _get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        The get_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(rpath, str)
            # First check for rpath trailing slash as _strip_protocol removes it.
            source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
            rpath = self._strip_protocol(rpath)
            rpaths = await self._expand_path(
                rpath, recursive=recursive, maxdepth=maxdepth
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [
                    p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not rpaths:
                    return

            lpath = make_path_posix(lpath)
            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # ensure local parent directories exist before any downloads start
        [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
        batch_size = kwargs.pop("batch_size", self.batch_size)

        coros = []
        callback.set_size(len(lpaths))
        for lpath, rpath in zip(lpaths, rpaths):
            # branch_coro gives each transfer its own child callback
            get_file = callback.branch_coro(self._get_file)
            coros.append(get_file(rpath, lpath, **kwargs))
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )

    async def _isfile(self, path):
        try:
            return (await self._info(path))["type"] == "file"
        except:  # noqa: E722
            # any failure to stat is treated as "not a file"
            return False

    async def _isdir(self, path):
        try:
            return (await self._info(path))["type"] == "directory"
        except OSError:
            return False

    async def _size(self, path):
        return (await self._info(path)).get("size", None)

    async def _sizes(self, paths, batch_size=None):
        batch_size = batch_size or self.batch_size
        return await _run_coros_in_chunks(
            [self._size(p) for p in paths], batch_size=batch_size
        )

    async def _exists(self, path, **kwargs):
        try:
            await self._info(path, **kwargs)
            return True
        except FileNotFoundError:
            return False

    async def _info(self, path, **kwargs):
        raise NotImplementedError

    async def _ls(self, path, detail=True, **kwargs):
        raise NotImplementedError

    async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        path = self._strip_protocol(path)
        full_dirs = {}
        dirs = {}
        files = {}

        detail = kwargs.pop("detail", False)
        try:
            listing = await self._ls(path, detail=True, **kwargs)
        except (FileNotFoundError, OSError) as e:
            if on_error == "raise":
                raise
            elif callable(on_error):
                on_error(e)
            # on error, still yield an empty level before stopping
            if detail:
                yield path, {}, {}
            else:
                yield path, [], []
            return

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[name] = pathname
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as give path
                files[""] = info
            else:
                files[name] = info

        if detail:
            yield path, dirs, files
        else:
            yield path, list(dirs), list(files)

        if maxdepth is not None:
            maxdepth -= 1
            if maxdepth < 1:
                return

        for d in dirs:
            # recurse into each subdirectory, re-yielding its levels
            async for _ in self._walk(
                full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs
            ):
                yield _

    async def _glob(self, path, maxdepth=None, **kwargs):
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # position of the first glob special character, if any
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # literal path: existence check only
            if await self._exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: await self._info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # root is everything up to the last "/" before the first magic char
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                # "**" with no maxdepth: unbounded find
                depth = None

        allpaths = await self._find(
            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
        )

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)

    async def _du(self, path, total=True, maxdepth=None, **kwargs):
        sizes = {}
        # async for?
        for f in await self._find(path, maxdepth=maxdepth, **kwargs):
            info = await self._info(f)
            sizes[info["name"]] = info["size"]
        if total:
            return sum(sizes.values())
        else:
            return sizes

    async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
        path = self._strip_protocol(path)
        out = {}
        detail = kwargs.pop("detail", False)

        # Add the root directory if withdirs is requested
        # This is needed for posix glob compliance
        if withdirs and path != "" and await self._isdir(path):
            out[path] = await self._info(path)

        # async for?
        async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
            if withdirs:
                files.update(dirs)
            out.update({info["name"]: info for name, info in files.items()})
        if not out and (await self._isfile(path)):
            # walk works on directories, but find should also return [path]
            # when path happens to be a file
            out[path] = {}
        names = sorted(out)
        if not detail:
            return names
        else:
            return {name: out[name] for name in names}

    async def _expand_path(self, path, recursive=False, maxdepth=None):
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, str):
            # normalize the single-path case through the list branch
            out = await self._expand_path([path], recursive, maxdepth)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:  # can gather here
                if has_magic(p):
                    bit = set(await self._glob(p, maxdepth=maxdepth))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            await self._expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            )
                        )
                    continue
                elif recursive:
                    rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True))
                    out |= rec
                if p not in out and (recursive is False or (await self._exists(p))):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)

    async def _mkdir(self, path, create_parents=True, **kwargs):
        pass  # not necessary to implement, may not have directories

    async def _makedirs(self, path, exist_ok=False):
        pass  # not necessary to implement, may not have directories

    async def open_async(self, path, mode="rb", **kwargs):
        # only binary, uncompressed streaming is supported by default
        if "b" not in mode or kwargs.get("compression"):
            raise ValueError
        raise NotImplementedError
913
+
914
+
915
def mirror_sync_methods(obj):
    """Populate sync and async methods for obj

    For each method will create a sync version if the name refers to an async method
    (coroutine) and there is no override in the child class; will create an async
    method for the corresponding sync method if there is no implementation.

    Uses the methods specified in
    - async_methods: the set that an implementation is expected to provide
    - default_async_methods: that can be derived from their sync version in
      AbstractFileSystem
    - AsyncFileSystem: async-specific default coroutines
    """
    from fsspec import AbstractFileSystem

    for method in async_methods + dir(AsyncFileSystem):
        if not method.startswith("_"):
            continue
        smethod = method[1:]  # sync name: strip the leading underscore
        if private.match(method):
            # the async source must actually be a coroutine function
            isco = inspect.iscoroutinefunction(getattr(obj, method, None))
            # unwrap a bound method to compare against the class-level default
            unsync = getattr(getattr(obj, smethod, False), "__func__", None)
            # "default" means the subclass did not override the sync version
            is_default = unsync is getattr(AbstractFileSystem, smethod, "")
            if isco and is_default:
                mth = sync_wrapper(getattr(obj, method), obj=obj)
                setattr(obj, smethod, mth)
                if not mth.__doc__:
                    # borrow the docstring from the AbstractFileSystem original
                    mth.__doc__ = getattr(
                        getattr(AbstractFileSystem, smethod, None), "__doc__", ""
                    )
945
+
946
+
947
class FSSpecCoroutineCancel(Exception):
    """Exception thrown into still-pending fsspec coroutines to cancel them."""
949
+
950
+
951
def _dump_running_tasks(
    printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False
):
    """Debug helper: report (and optionally cancel) unfinished tasks on the IO loop.

    NOTE(review): this reaches into CPython task internals (``task._coro``,
    ``task._callbacks``) and so is implementation-specific; intended for
    debugging, not production use.

    Parameters
    ----------
    printout: bool
        If True, print each task's stack to stdout.
    cancel: bool
        If True, forcibly cancel each pending task and throw ``exc`` into it.
    exc: Exception class
        The exception used to terminate the coroutines.
    with_task: bool
        If True, include the task object itself in each output record.
    """
    import traceback

    tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()]
    if printout:
        [task.print_stack() for task in tasks]
    out = [
        {
            "locals": task._coro.cr_frame.f_locals,
            "file": task._coro.cr_frame.f_code.co_filename,
            "firstline": task._coro.cr_frame.f_code.co_firstlineno,
            "linelo": task._coro.cr_frame.f_lineno,
            "stack": traceback.format_stack(task._coro.cr_frame),
            "task": task if with_task else None,
        }
        for task in tasks
    ]
    if cancel:
        for t in tasks:
            cbs = t._callbacks
            t.cancel()
            # force the future into a terminal state before firing callbacks
            asyncio.futures.Future.set_exception(t, exc)
            asyncio.futures.Future.cancel(t)
            [cb[0](t) for cb in cbs]  # cancels any dependent concurrent.futures
            try:
                t._coro.throw(exc)  # exits coro, unless explicitly handled
            except exc:
                pass
    return out
982
+
983
+
984
class AbstractAsyncStreamedFile(AbstractBufferedFile):
    # no read buffering, and always auto-commit
    # TODO: readahead might still be useful here, but needs async version

    async def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = await self._fetch_range(self.loc, self.loc + length)
        self.loc += len(out)
        return out

    async def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if self.mode not in {"wb", "ab"}:
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            await self.flush()
        return out

    async def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        if self.mode == "rb":
            self.cache = None
        else:
            if not self.forced:
                # push any remaining buffered bytes before closing
                await self.flush(force=True)

        if self.fs is not None:
            # listings for the file and its parent directory are now stale
            self.fs.invalidate_cache(self.path)
            self.fs.invalidate_cache(self.fs._parent(self.path))

        self.closed = True

    async def flush(self, force=False):
        """Send buffered data to the backend.

        Only writes when the buffer has reached ``blocksize``, unless
        ``force`` is given (as happens on close).

        Parameters
        ----------
        force: bool
            Write the buffer even if it is smaller than blocksize; may only
            be done once, after which only close() is allowed.
        """
        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.mode not in {"wb", "ab"}:
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                await self._initiate_upload()
            except:  # noqa: E722
                # a failed initiation leaves the file unusable; re-raise
                self.closed = True
                raise

        if await self._upload_chunk(final=force) is not False:
            # chunk accepted: advance the offset and start a fresh buffer
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def _fetch_range(self, start, end):
        # backend hook: return bytes [start, end) from the remote file
        raise NotImplementedError

    async def _initiate_upload(self):
        # backend hook: one-time setup before the first chunk; no-op by default
        pass

    async def _upload_chunk(self, final=False):
        # backend hook: ship the current buffer; return False to keep buffering
        raise NotImplementedError
meow/lib/python3.13/site-packages/fsspec/caching.py ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import functools
5
+ import logging
6
+ import math
7
+ import os
8
+ import threading
9
+ import warnings
10
+ from concurrent.futures import Future, ThreadPoolExecutor
11
+ from itertools import groupby
12
+ from operator import itemgetter
13
+ from typing import (
14
+ TYPE_CHECKING,
15
+ Any,
16
+ Callable,
17
+ ClassVar,
18
+ Generic,
19
+ NamedTuple,
20
+ Optional,
21
+ OrderedDict,
22
+ TypeVar,
23
+ )
24
+
25
+ if TYPE_CHECKING:
26
+ import mmap
27
+
28
+ from typing_extensions import ParamSpec
29
+
30
+ P = ParamSpec("P")
31
+ else:
32
+ P = TypeVar("P")
33
+
34
+ T = TypeVar("T")
35
+
36
+
37
+ logger = logging.getLogger("fsspec")
38
+
39
+ Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes
40
+
41
+
42
class BaseCache:
    """Pass-through cache: stores nothing, calls the fetcher every time.

    Also acts as the common base class for the real cache implementations.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    name: ClassVar[str] = "none"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        self.blocksize = blocksize
        self.nblocks = 0
        self.fetcher = fetcher
        self.size = size
        # statistics: cache hits/misses and bytes actually requested upstream
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        # open-ended requests mean "from the beginning" / "to the end"
        start = 0 if start is None else start
        stop = self.size if stop is None else stop
        # empty or out-of-range requests short-circuit
        if start >= self.size or start >= stop:
            return b""
        return self.fetcher(start, stop)

    def _reset_stats(self) -> None:
        """Zero the hit/miss/byte counters, e.g. for per-file reporting."""
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _log_stats(self) -> str:
        """Return a formatted string of the cache statistics."""
        if not (self.hit_count or self.miss_count):
            # a cache that does nothing, this is for logs only
            return ""
        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

    def __repr__(self) -> str:
        # TODO: use rich for better formatting
        return f"""
        <{self.__class__.__name__}:
            block size  :   {self.blocksize}
            block count :   {self.nblocks}
            file size   :   {self.size}
            cache hits  :   {self.hit_count}
            cache misses:   {self.miss_count}
            total requested bytes: {self.total_requested_bytes}>
        """
103
+
104
+
105
class MMapCache(BaseCache):
    """memory-mapped sparse file cache

    Opens temporary file, which is filled blocks-wise when data is requested.
    Ensure there is enough disc space in the temporary location.

    This cache method might only work on posix

    Parameters
    ----------
    location: str or None
        Path of the file backing the mmap; an anonymous temporary file
        is used when None.
    blocks: set[int] or None
        Block numbers already valid in an existing backing file.
    """

    name = "mmap"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        location: str | None = None,
        blocks: set[int] | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.blocks = set() if blocks is None else blocks
        self.location = location
        self.cache = self._makefile()

    def _makefile(self) -> mmap.mmap | bytearray:
        """Create or reopen the size-``self.size`` backing store for the mmap."""
        import mmap
        import tempfile

        if self.size == 0:
            # cannot mmap a zero-length file; a bytearray slices the same way
            return bytearray()

        # posix version
        if self.location is None or not os.path.exists(self.location):
            if self.location is None:
                fd = tempfile.TemporaryFile()
                self.blocks = set()
            else:
                fd = open(self.location, "wb+")
            # write a single byte at the last offset to extend the file
            # to the full size before mapping it
            fd.seek(self.size - 1)
            fd.write(b"1")
            fd.flush()
        else:
            # reuse an existing backing file; `blocks` says which parts
            # of it already hold valid data
            fd = open(self.location, "r+b")

        return mmap.mmap(fd.fileno(), self.size)

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        logger.debug(f"MMap cache fetching {start}-{end}")
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        start_block = start // self.blocksize
        end_block = end // self.blocksize
        block_range = range(start_block, end_block + 1)
        # Determine which blocks need to be fetched. This sequence is sorted by construction.
        need = (i for i in block_range if i not in self.blocks)
        # Count the number of blocks already cached
        self.hit_count += sum(1 for i in block_range if i in self.blocks)

        # Consolidate needed blocks.
        # Algorithm adapted from Python 2.x itertools documentation.
        # We are grouping an enumerated sequence of blocks. By comparing when the difference
        # between an ascending range (provided by enumerate) and the needed block numbers
        # we can detect when the block number skips values. The key computes this difference.
        # Whenever the difference changes, we know that we have previously cached block(s),
        # and a new group is started. In other words, this algorithm neatly groups
        # runs of consecutive block numbers so they can be fetched together.
        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
            # Extract the blocks from the enumerated sequence
            _blocks = tuple(map(itemgetter(1), _blocks))
            # Compute start of first block
            sstart = _blocks[0] * self.blocksize
            # Compute the end of the last block. Last block may not be full size.
            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)

            # Fetch bytes (could be multiple consecutive blocks)
            self.total_requested_bytes += send - sstart
            logger.debug(
                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
            )
            self.cache[sstart:send] = self.fetcher(sstart, send)

            # Update set of cached blocks
            self.blocks.update(_blocks)
            # Update cache statistics with number of blocks we had to cache
            self.miss_count += len(_blocks)

        return self.cache[start:end]

    def __getstate__(self) -> dict[str, Any]:
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state["cache"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Restore instance attributes
        self.__dict__.update(state)
        # Recreate the mmap from location/blocks (data re-validated lazily)
        self.cache = self._makefile()
207
+
208
+
209
class ReadAheadCache(BaseCache):
    """Cache which reads only when we get beyond a block of data

    This is a much simpler version of BytesCache, and does not attempt to
    fill holes in the cache or keep fragments alive. It is best suited to
    many small reads in a sequential order (e.g., reading lines from a file).
    """

    name = "readahead"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        super().__init__(blocksize, fetcher, size)
        # the single cached window: bytes for file offsets [start, end)
        self.cache = b""
        self.start = 0
        self.end = 0

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        start = 0 if start is None else start
        if end is None or end > self.size:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        remaining = end - start
        if self.start <= start and end <= self.end:
            # full hit: the request lies entirely inside the cached window
            self.hit_count += 1
            return self.cache[start - self.start : end - self.start]
        if self.start <= start < self.end:
            # partial hit: reuse the cached tail, then fetch what follows
            self.miss_count += 1
            part = self.cache[start - self.start :]
            remaining -= len(part)
            start = self.end
        else:
            # complete miss
            self.miss_count += 1
            part = b""
        # read ahead by one extra block beyond the request (capped at EOF)
        end = min(self.size, end + self.blocksize)
        self.total_requested_bytes += end - start
        self.cache = self.fetcher(start, end)  # new block replaces old
        self.start = start
        self.end = self.start + len(self.cache)
        return part + self.cache[:remaining]
253
+
254
+
255
class FirstChunkCache(BaseCache):
    """Caches the first block of a file only

    This may be useful for file types where the metadata is stored in the header,
    but is randomly accessed.
    """

    name = "first"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        if blocksize > size:
            # this will buffer the whole thing
            blocksize = size
        super().__init__(blocksize, fetcher, size)
        # the header block, populated on first read
        self.cache: bytes | None = None

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read ``[start, end)``, serving from the cached header when possible.

        ``start``/``end`` of None mean "from 0" / "to EOF", as in BaseCache.
        """
        start = start or 0
        if start > self.size:
            logger.debug("FirstChunkCache: requested start > file size")
            return b""

        # BUGFIX: an open-ended read (end=None) previously raised
        # TypeError via min(None, self.size); treat None as "to EOF"
        # for consistency with the other cache implementations.
        end = self.size if end is None else min(end, self.size)

        if start < self.blocksize:
            if self.cache is None:
                self.miss_count += 1
                if end > self.blocksize:
                    # request extends past the header: fetch everything in
                    # one call and keep only the header portion cached
                    self.total_requested_bytes += end
                    data = self.fetcher(0, end)
                    self.cache = data[: self.blocksize]
                    return data[start:]
                self.cache = self.fetcher(0, self.blocksize)
                self.total_requested_bytes += self.blocksize
            part = self.cache[start:end]
            if end > self.blocksize:
                # the tail beyond the header is fetched but not cached
                self.total_requested_bytes += end - self.blocksize
                part += self.fetcher(self.blocksize, end)
            self.hit_count += 1
            return part
        else:
            # request entirely outside the header: plain pass-through
            self.miss_count += 1
            self.total_requested_bytes += end - start
            return self.fetcher(start, end)
299
+
300
+
301
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name = "blockcache"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy before deleting the unpicklable lru_cache wrapper.
        # Previously this deleted straight from the live self.__dict__,
        # breaking the instance as a side effect of pickling it
        # (MMapCache.__getstate__ already does the copy correctly).
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
        # rebuild the per-instance LRU wrapper dropped by __getstate__
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read ``[start, end)`` by assembling the covering cached blocks."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int) -> bytes:
        """
        Fetch the block of data for `block_number`.

        Note the boundary check is deliberately ``>``: block ``nblocks``
        itself is allowed and yields b"" via BaseCache._fetch.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        self.total_requested_bytes += end - start
        self.miss_count += 1
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        self.hit_count += 1
        if start_block_number == end_block_number:
            block: bytes = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
436
+
437
+
438
class BytesCache(BaseCache):
    """Cache which holds data in a in-memory bytes object

    Implements read-ahead by the block size, for semi-random reads progressing
    through the file.

    Parameters
    ----------
    trim: bool
        As we read more data, whether to discard the start of the buffer when
        we are more than a blocksize ahead of it.
    """

    name: ClassVar[str] = "bytes"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # single cached window: bytes for file offsets [start, end);
        # None until the first read populates it
        self.cache = b""
        self.start: int | None = None
        self.end: int | None = None
        self.trim = trim

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        # TODO: only set start/end after fetch, in case it fails?
        # is this where retry logic might go?
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        if (
            self.start is not None
            and start >= self.start
            and self.end is not None
            and end < self.end
        ):
            # cache hit: we have all the required data
            # (note the strict `end < self.end`: a read ending exactly at
            # the window edge falls through to the fetch path below)
            offset = start - self.start
            self.hit_count += 1
            return self.cache[offset : offset + end - start]

        if self.blocksize:
            # read ahead one blocksize past the request, capped at EOF
            bend = min(self.size, end + self.blocksize)
        else:
            bend = end

        if bend == start or start > self.size:
            return b""

        if (self.start is None or start < self.start) and (
            self.end is None or end > self.end
        ):
            # First read, or extending both before and after
            self.total_requested_bytes += bend - start
            self.miss_count += 1
            self.cache = self.fetcher(start, bend)
            self.start = start
        else:
            assert self.start is not None
            assert self.end is not None
            self.miss_count += 1

            if start < self.start:
                # request begins before the cached window
                if self.end is None or self.end - end > self.blocksize:
                    # window is far ahead of the request: replace it wholesale
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # fetch just the missing head and prepend it
                    self.total_requested_bytes += self.start - start
                    new = self.fetcher(start, self.start)
                    self.start = start
                    self.cache = new + self.cache
            elif self.end is not None and bend > self.end:
                # request extends past the cached window
                if self.end > self.size:
                    # already cached through EOF; nothing more to read
                    pass
                elif end - self.end > self.blocksize:
                    # gap beyond the window is too large: replace wholesale
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # fetch just the missing tail and append it
                    self.total_requested_bytes += bend - self.end
                    new = self.fetcher(self.end, bend)
                    self.cache = self.cache + new

        self.end = self.start + len(self.cache)
        offset = start - self.start
        out = self.cache[offset : offset + end - start]
        if self.trim:
            # drop whole leading blocks once the window grows beyond one
            # block; dividing by (blocksize + 1) also avoids ZeroDivisionError
            # when blocksize == 0
            num = (self.end - self.start) // (self.blocksize + 1)
            if num > 1:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num :]
        return out

    def __len__(self) -> int:
        # number of bytes currently held in the window
        return len(self.cache)
537
+
538
+
539
class AllBytes(BaseCache):
    """Cache entire contents of the file"""

    name: ClassVar[str] = "all"

    def __init__(
        self,
        blocksize: int | None = None,
        fetcher: Fetcher | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
        if data is None:
            # no pre-supplied content: read the whole file up front
            self.miss_count += 1
            self.total_requested_bytes += self.size
            self.data = self.fetcher(0, self.size)
        else:
            self.data = data

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Slice the in-memory copy; every read counts as a hit."""
        self.hit_count += 1
        return self.data[start:stop]
561
+
562
+
563
class KnownPartsOfAFile(BaseCache):
    """
    Cache holding known file parts.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    data: dict
        A dictionary mapping explicit `(start, stop)` file-offset tuples
        with known bytes.
    strict: bool, default True
        Whether to fetch reads that go beyond a known byte-range boundary.
        If `False`, any read that ends outside a known part will be zero
        padded. Note that zero padding will not be used for reads that
        begin outside a known byte-range.
    """

    name: ClassVar[str] = "parts"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        data: Optional[dict[tuple[int, int], bytes]] = None,
        strict: bool = True,
        **_: Any,
    ):
        super().__init__(blocksize, fetcher, size)
        self.strict = strict

        # simple consolidation of contiguous blocks
        if data:
            old_offsets = sorted(data.keys())
            offsets = [old_offsets[0]]
            blocks = [data.pop(old_offsets[0])]
            for start, stop in old_offsets[1:]:
                start0, stop0 = offsets[-1]
                if start == stop0:
                    # this part starts exactly where the previous one ended:
                    # merge the two ranges into a single entry
                    offsets[-1] = (start0, stop)
                    blocks[-1] += data.pop((start, stop))
                else:
                    offsets.append((start, stop))
                    blocks.append(data.pop((start, stop)))

            # dict preserves insertion order, so iteration in _fetch
            # sees the ranges in ascending offset order
            self.data = dict(zip(offsets, blocks))
        else:
            self.data = {}

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        # Serve [start, stop) from the known parts; fall back to the
        # fetcher (with a warning) for anything outside them.
        if start is None:
            start = 0
        if stop is None:
            stop = self.size

        out = b""
        for (loc0, loc1), data in self.data.items():
            # If self.strict=False, use zero-padded data
            # for reads beyond the end of a "known" buffer
            if loc0 <= start < loc1:
                off = start - loc0
                out = data[off : off + stop - start]
                if not self.strict or loc0 <= stop <= loc1:
                    # The request is within a known range, or
                    # it begins within a known range, and we
                    # are allowed to pad reads beyond the
                    # buffer with zero
                    out += b"\x00" * (stop - start - len(out))
                    self.hit_count += 1
                    return out
                else:
                    # The request ends outside a known range,
                    # and we are being "strict" about reads
                    # beyond the buffer
                    start = loc1
                    break

        # We only get here if there is a request outside the
        # known parts of the file. In an ideal world, this
        # should never happen
        if self.fetcher is None:
            # We cannot fetch the data, so raise an error
            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
        # We can fetch the data, but should warn the user
        # that this may be slow
        warnings.warn(
            f"Read is outside the known file parts: {(start, stop)}. "
            f"IO/caching performance may be poor!"
        )
        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
        self.total_requested_bytes += stop - start
        self.miss_count += 1
        # `out` holds any partial data found above (strict case); the
        # remainder [start, stop) comes from the fetcher
        return out + super()._fetch(start, stop)
662
+
663
+
664
class UpdatableLRU(Generic[P, T]):
    """
    Thread-safe LRU cache whose entries can also be inserted directly.

    Used by BackgroundBlockCache, which deposits results computed on a
    background thread via ``add_key``.
    """

    class CacheInfo(NamedTuple):
        hits: int
        misses: int
        maxsize: int
        currsize: int

    def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
        self._cache: OrderedDict[Any, T] = collections.OrderedDict()
        self._func = func
        self._max_size = max_size
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
        # positional args form the cache key; keyword args are unsupported
        if kwargs:
            raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
        with self._lock:
            if args in self._cache:
                # hit: refresh recency and return the stored value
                self._hits += 1
                self._cache.move_to_end(args)
                return self._cache[args]

        # miss: compute outside the lock so other callers are not blocked
        value = self._func(*args, **kwargs)

        with self._lock:
            self._misses += 1
            self._cache[args] = value
            if len(self._cache) > self._max_size:
                # evict the least-recently-used entry
                self._cache.popitem(last=False)

        return value

    def is_key_cached(self, *args: Any) -> bool:
        with self._lock:
            return args in self._cache

    def add_key(self, result: T, *args: Any) -> None:
        # insert a precomputed value (e.g. from a background thread)
        with self._lock:
            self._cache[args] = result
            if len(self._cache) > self._max_size:
                self._cache.popitem(last=False)

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        with self._lock:
            return self.CacheInfo(
                maxsize=self._max_size,
                currsize=len(self._cache),
                hits=self._hits,
                misses=self._misses,
            )
722
+
723
+
724
class BackgroundBlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks with pre-loading of
    the next block in the background.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored. If the
    next block is not in cache, it is loaded in a separate thread
    in non-blocking way.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name: ClassVar[str] = "background"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)

        # single worker for read-ahead; the three _fetch_future* attributes
        # below are only read/written while holding _fetch_future_lock
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number: int | None = None
        self._fetch_future: Future[bytes] | None = None
        self._fetch_future_lock = threading.Lock()

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: operate on a copy of __dict__. Previously the unpicklable
        # entries were deleted from the live instance's dict, breaking the
        # object as a side effect of pickling it (MMapCache.__getstate__
        # already copies correctly).
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        del state["_thread_executor"]
        del state["_fetch_future_block_number"]
        del state["_fetch_future"]
        del state["_fetch_future_lock"]
        return state

    def __setstate__(self, state) -> None:
        self.__dict__.update(state)
        # rebuild everything __getstate__ dropped
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number = None
        self._fetch_future = None
        self._fetch_future_lock = threading.Lock()

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read ``[start, end)``, harvesting or awaiting any background fetch."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        fetch_future_block_number = None
        fetch_future = None
        with self._fetch_future_lock:
            # Background thread is running. Check whether we can or must join it.
            if self._fetch_future is not None:
                assert self._fetch_future_block_number is not None
                if self._fetch_future.done():
                    logger.info("BlockCache joined background fetch without waiting.")
                    self._fetch_block_cached.add_key(
                        self._fetch_future.result(), self._fetch_future_block_number
                    )
                    # Cleanup the fetch variables. Done with fetching the block.
                    self._fetch_future_block_number = None
                    self._fetch_future = None
                else:
                    # Must join if we need the block for the current fetch
                    must_join = bool(
                        start_block_number
                        <= self._fetch_future_block_number
                        <= end_block_number
                    )
                    if must_join:
                        # Copy to the local variables to release lock
                        # before waiting for result
                        fetch_future_block_number = self._fetch_future_block_number
                        fetch_future = self._fetch_future

                        # Cleanup the fetch variables. Have a local copy.
                        self._fetch_future_block_number = None
                        self._fetch_future = None

        # Need to wait for the future for the current read
        if fetch_future is not None:
            logger.info("BlockCache waiting for background fetch.")
            # Wait until result and put it in cache
            self._fetch_block_cached.add_key(
                fetch_future.result(), fetch_future_block_number
            )

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        # fetch next block in the background if nothing is running in the background,
        # the block is within file and it is not already cached
        end_block_plus_1 = end_block_number + 1
        with self._fetch_future_lock:
            if (
                self._fetch_future is None
                and end_block_plus_1 <= self.nblocks
                and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
            ):
                self._fetch_future_block_number = end_block_plus_1
                self._fetch_future = self._thread_executor.submit(
                    self._fetch_block, end_block_plus_1, "async"
                )

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
        """
        Fetch the block of data for `block_number`.

        The boundary check is deliberately ``>``: block ``nblocks`` itself
        is allowed (the background prefetch may request it) and yields b""
        via BaseCache._fetch.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
        self.total_requested_bytes += end - start
        self.miss_count += 1
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        # kind of pointless to count this as a hit, but it is
        self.hit_count += 1

        if start_block_number == end_block_number:
            block = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
928
+
929
+
930
# Registry mapping cache name -> implementation, populated below by
# register_cache(); the None key selects the pass-through default.
caches: dict[str | None, type[BaseCache]] = {
    # one custom case
    None: BaseCache,
}
934
+
935
+
936
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
    """'Register' cache implementation.

    Makes ``cls`` available in the module-level ``caches`` registry under
    its ``name`` attribute.

    Parameters
    ----------
    clobber: bool, optional
        If set to True (default is False) - allow to overwrite existing
        entry.

    Raises
    ------
    ValueError
        If an entry with this name already exists and ``clobber`` is False.
    """
    name = cls.name
    if name in caches and not clobber:
        raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}")
    caches[name] = cls
953
+
954
+
955
# Register all built-in cache implementations under their class `name`s.
for c in (
    BaseCache,
    MMapCache,
    BytesCache,
    ReadAheadCache,
    BlockCache,
    FirstChunkCache,
    AllBytes,
    KnownPartsOfAFile,
    BackgroundBlockCache,
):
    register_cache(c)
meow/lib/python3.13/site-packages/fsspec/callbacks.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import wraps
2
+
3
+
4
class Callback:
    """
    Base class and interface for callback mechanism

    This class can be used directly for monitoring file transfers by
    providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
    below), or subclassed for more specialised behaviour.

    Parameters
    ----------
    size: int (optional)
        Nominal quantity for the value that corresponds to a complete
        transfer, e.g., total number of tiles or total number of
        bytes
    value: int (0)
        Starting internal counter value
    hooks: dict or None
        A dict of named functions to be called on each update. The signature
        of these must be ``f(size, value, **kwargs)``
    """

    def __init__(self, size=None, value=0, hooks=None, **kwargs):
        self.size = size
        self.value = value
        # hooks default to an empty dict; extra kwargs are forwarded to hooks
        self.hooks = hooks or {}
        self.kw = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *exc_args):
        self.close()

    def close(self):
        """Close callback."""

    def branched(self, path_1, path_2, **kwargs):
        """
        Return callback for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The function returns a callback
        that has to be passed to the child method, e.g., put_file,
        as `callback=` argument.

        The implementation uses `callback.branch` for compatibility.
        When implementing callbacks, it is recommended to override this function instead
        of `branch` and avoid calling `super().branched(...)`.

        Prefer using this function over `branch`.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        **kwargs:
            Arbitrary keyword arguments

        Returns
        -------
        callback: Callback
            A callback instance to be passed to the child method
        """
        # branch() may (or may not) have inserted a "callback" key into kwargs
        self.branch(path_1, path_2, kwargs)
        # mutate kwargs so that we can force the caller to pass "callback=" explicitly
        return kwargs.pop("callback", DEFAULT_CALLBACK)

    def branch_coro(self, fn):
        """
        Wraps a coroutine, and pass a new child callback to it.
        """

        @wraps(fn)
        async def func(path1, path2: str, **kwargs):
            with self.branched(path1, path2, **kwargs) as child:
                return await fn(path1, path2, callback=child, **kwargs)

        return func

    def set_size(self, size):
        """
        Set the internal maximum size attribute

        Usually called if not initially set at instantiation. Note that this
        triggers a ``call()``.

        Parameters
        ----------
        size: int
        """
        self.size = size
        self.call()

    def absolute_update(self, value):
        """
        Set the internal value state

        Triggers ``call()``

        Parameters
        ----------
        value: int
        """
        self.value = value
        self.call()

    def relative_update(self, inc=1):
        """
        Delta increment the internal counter

        Triggers ``call()``

        Parameters
        ----------
        inc: int
        """
        self.value += inc
        self.call()

    def call(self, hook_name=None, **kwargs):
        """
        Execute hook(s) with current state

        Each function is passed the internal size and current value

        Parameters
        ----------
        hook_name: str or None
            If given, execute on this hook
        kwargs: passed on to (all) hook(s)
        """
        if not self.hooks:
            return
        # per-call kwargs override those given at construction time
        kw = self.kw.copy()
        kw.update(kwargs)
        if hook_name:
            # single named hook: silently ignore unknown names
            if hook_name not in self.hooks:
                return
            return self.hooks[hook_name](self.size, self.value, **kw)
        for hook in self.hooks.values() or []:
            hook(self.size, self.value, **kw)

    def wrap(self, iterable):
        """
        Wrap an iterable to call ``relative_update`` on each iterations

        Parameters
        ----------
        iterable: Iterable
            The iterable that is being wrapped
        """
        for item in iterable:
            self.relative_update()
            yield item

    def branch(self, path_1, path_2, kwargs):
        """
        Set callbacks for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The passed kwargs are
        to be *mutated* to add ``callback=``, if this class supports branching
        to children.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        kwargs: dict
            arguments passed to child method, e.g., put_file.

        Returns
        -------

        """
        return None

    def no_op(self, *_, **__):
        # sink for any method name not explicitly defined (see __getattr__)
        pass

    def __getattr__(self, item):
        """
        If undefined methods are called on this class, nothing happens
        """
        return self.no_op

    @classmethod
    def as_callback(cls, maybe_callback=None):
        """Transform callback=... into Callback instance

        For the special value of ``None``, return the global instance of
        ``NoOpCallback``. This is an alternative to including
        ``callback=DEFAULT_CALLBACK`` directly in a method signature.
        """
        if maybe_callback is None:
            return DEFAULT_CALLBACK
        return maybe_callback
205
+
206
+
207
class NoOpCallback(Callback):
    """
    Callback subclass that swallows every event.

    Updates are still recorded on the instance, but ``call`` is overridden
    to report nothing anywhere.
    """

    def call(self, *args, **kwargs):
        # deliberately do nothing, whatever hooks may have been supplied
        return None
214
+
215
+
216
class DotPrinterCallback(Callback):
    """
    Simple example Callback implementation

    Nearly the same as the base Callback, but each update prints a single
    character; the outer layer prints "#" while branched (inner) transfers
    print ".".
    """

    def __init__(self, chr_to_print="#", **kwargs):
        super().__init__(**kwargs)
        # character emitted on every call()
        self.chr = chr_to_print

    def branch(self, path_1, path_2, kwargs):
        """Mutate kwargs so the child transfer prints "." instead"""
        kwargs["callback"] = DotPrinterCallback(".")

    def call(self, **kwargs):
        """Emit one progress character (no newline)"""
        print(self.chr, end="")
235
+
236
+
237
class TqdmCallback(Callback):
    """
    A callback to display a progress bar using tqdm

    Parameters
    ----------
    tqdm_kwargs : dict, (optional)
        Any argument accepted by the tqdm constructor.
        See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
        Will be forwarded to `tqdm_cls`.
    tqdm_cls: (optional)
        subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.

    Examples
    --------
    >>> import fsspec
    >>> from fsspec.callbacks import TqdmCallback
    >>> fs = fsspec.filesystem("memory")
    >>> path2distant_data = "/your-path"
    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(),
        )

    You can forward args to tqdm using the ``tqdm_kwargs`` parameter.

    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
        )

    You can also customize the progress bar by passing a subclass of `tqdm`.

    .. code-block:: python

        class TqdmFormat(tqdm):
            '''Provides a `total_time` format parameter'''
            @property
            def format_dict(self):
                d = super().format_dict
                total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
                d.update(total_time=self.format_interval(total_time) + " in total")
                return d

    >>> with TqdmCallback(
            tqdm_kwargs={
                "desc": "desc",
                "bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
            },
            tqdm_cls=TqdmFormat,
        ) as callback:
            fs.upload(".", path2distant_data, recursive=True, callback=callback)
    """

    def __init__(self, tqdm_kwargs=None, *args, **kwargs):
        # tqdm is an optional dependency; fail with a clear message if absent
        try:
            from tqdm import tqdm

        except ImportError as exce:
            raise ImportError(
                "Using TqdmCallback requires tqdm to be installed"
            ) from exce

        self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
        self._tqdm_kwargs = tqdm_kwargs or {}
        # the bar is created lazily on the first call(), once size is known
        self.tqdm = None
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        if self.tqdm is None:
            self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
        # size may have been set after the bar was created; keep it in sync
        self.tqdm.total = self.size
        self.tqdm.update(self.value - self.tqdm.n)

    def close(self):
        if self.tqdm is not None:
            self.tqdm.close()
            self.tqdm = None

    def __del__(self):
        # best-effort cleanup if the callback is garbage-collected unclosed
        return self.close()
322
+
323
+
324
# Module-level singleton used whenever no callback is supplied.
# ``_DEFAULT_CALLBACK`` is kept as a backwards-compatible alias.
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
meow/lib/python3.13/site-packages/fsspec/compression.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helper functions for a standard streaming compression API"""
2
+
3
+ from zipfile import ZipFile
4
+
5
+ import fsspec.utils
6
+ from fsspec.spec import AbstractBufferedFile
7
+
8
+
9
def noop_file(file, mode, **kwargs):
    """Identity "codec": hand back *file* untouched (no compression)."""
    return file
11
+
12
+
13
# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
# Registry of compression codecs keyed by name; ``None`` means no compression.
compr = {None: noop_file}
16
+
17
+
18
def register_compression(name, callback, extensions, force=False):
    """Register an "inferable" file compression type.

    Registers transparent file compression type for use with fsspec.open.
    Compression can be specified by name in open, or "infer"-ed for any files
    ending with the given extensions.

    Args:
        name: (str) The compression type name. Eg. "gzip".
        callback: A callable of form (infile, mode, **kwargs) -> file-like.
            Accepts an input file-like object, the target mode and kwargs.
            Returns a wrapped file-like object.
        extensions: (str, Iterable[str]) A file extension, or list of file
            extensions for which to infer this compression scheme. Eg. "gz".
        force: (bool) Force re-registration of compression type or extensions.

    Raises:
        ValueError: If name or extensions already registered, and not force.

    """
    if isinstance(extensions, str):
        extensions = [extensions]

    # Validate registration before mutating anything, so a rejected call
    # leaves both registries untouched
    if name in compr and not force:
        raise ValueError(f"Duplicate compression registration: {name}")

    for ext in extensions:
        if ext in fsspec.utils.compressions and not force:
            raise ValueError(f"Duplicate compression file extension: {ext} ({name})")

    compr[name] = callback

    # record extension -> name so compression="infer" can resolve suffixes
    for ext in extensions:
        fsspec.utils.compressions[ext] = name
53
+
54
+
55
def unzip(infile, mode="rb", filename=None, **kwargs):
    """Open a single member of a ZIP archive as a file-like object.

    In write mode a fresh archive is created containing one member named
    *filename* (default "file"); in read mode the member *filename*
    (default: the first entry) is opened for reading.
    """
    if "r" not in mode:
        filename = filename or "file"
        z = ZipFile(infile, mode="w", **kwargs)
        fo = z.open(filename, mode="w")
        # closing the member must also close the archive so the central
        # directory is written; chain the original close with z.close
        fo.close = lambda closer=fo.close: closer() or z.close()
        return fo
    z = ZipFile(infile)
    if filename is None:
        # no member requested: default to the archive's first entry
        filename = z.namelist()[0]
    return z.open(filename, mode="r", **kwargs)
66
+
67
+
68
# Built-in codec registrations. Optional backends are guarded with
# try/except ImportError so a missing library simply leaves that
# compression unavailable rather than breaking the import.
register_compression("zip", unzip, "zip")

try:
    from bz2 import BZ2File
except ImportError:
    pass
else:
    register_compression("bz2", BZ2File, "bz2")

try:  # pragma: no cover
    # prefer ISA-L's accelerated gzip implementation when available
    from isal import igzip

    def isal(infile, mode="rb", **kwargs):
        return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)

    register_compression("gzip", isal, "gz")
except ImportError:
    from gzip import GzipFile

    register_compression(
        "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
    )

try:
    from lzma import LZMAFile

    register_compression("lzma", LZMAFile, "lzma")
    register_compression("xz", LZMAFile, "xz")
except ImportError:
    pass

try:
    import lzmaffi

    # lzmaffi, when installed, supersedes the stdlib lzma registrations
    register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
    register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
    pass
+ pass
106
+
107
+
108
class SnappyFile(AbstractBufferedFile):
    """Buffered file wrapper applying snappy (de)compression on the fly."""

    def __init__(self, infile, mode, **kwargs):
        import snappy

        # size is a large sentinel: the true decompressed size is unknown
        # up front — TODO confirm downstream code tolerates this
        super().__init__(
            fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
        )
        self.infile = infile
        # streaming codec direction depends on read vs write mode
        if "r" in mode:
            self.codec = snappy.StreamDecompressor()
        else:
            self.codec = snappy.StreamCompressor()

    def _upload_chunk(self, final=False):
        # compress whatever accumulated in the write buffer and push it on
        self.buffer.seek(0)
        out = self.codec.add_chunk(self.buffer.read())
        self.infile.write(out)
        return True

    def seek(self, loc, whence=0):
        raise NotImplementedError("SnappyFile is not seekable")

    def seekable(self):
        return False

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        # stream decoding only: read forward (end - start) compressed bytes;
        # true random access is unsupported (see seek above)
        data = self.infile.read(end - start)
        return self.codec.decompress(data)
+ return self.codec.decompress(data)
137
+
138
+
139
+ try:
140
+ import snappy
141
+
142
+ snappy.compress(b"")
143
+ # Snappy may use the .sz file extension, but this is not part of the
144
+ # standard implementation.
145
+ register_compression("snappy", SnappyFile, [])
146
+
147
+ except (ImportError, NameError, AttributeError):
148
+ pass
149
+
150
+ try:
151
+ import lz4.frame
152
+
153
+ register_compression("lz4", lz4.frame.open, "lz4")
154
+ except ImportError:
155
+ pass
156
+
157
+ try:
158
+ import zstandard as zstd
159
+
160
+ def zstandard_file(infile, mode="rb"):
161
+ if "r" in mode:
162
+ cctx = zstd.ZstdDecompressor()
163
+ return cctx.stream_reader(infile)
164
+ else:
165
+ cctx = zstd.ZstdCompressor(level=10)
166
+ return cctx.stream_writer(infile)
167
+
168
+ register_compression("zstd", zstandard_file, "zst")
169
+ except ImportError:
170
+ pass
171
+
172
+
173
+ def available_compressions():
174
+ """Return a list of the implemented compressions."""
175
+ return list(compr)
meow/lib/python3.13/site-packages/fsspec/config.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import json
5
+ import os
6
+ import warnings
7
+ from typing import Any
8
+
9
# Global configuration: maps protocol name -> default kwargs for that backend.
conf: dict[str, dict[str, Any]] = {}
# Directory scanned for ini/json config files; overridable via env var.
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
12
+
13
+
14
def set_conf_env(conf_dict, envdict=os.environ):
    """Set config values from environment variables

    Looks for variables of the form ``FSSPEC_<protocol>`` and
    ``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
    as a json dictionary and used to ``update`` the config of the
    corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
    attempt to convert the string value, but the kwarg keys will be lower-cased.

    The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
    ``FSSPEC_<protocol>`` ones.

    Parameters
    ----------
    conf_dict : dict(str, dict)
        This dict will be mutated
    envdict : dict-like(str, str)
        Source for the values - usually the real environment
    """
    deferred_kwargs = []
    for key in envdict:
        # a usable key is "FSSPEC_" plus a non-empty name not starting with "_"
        protocol_like = key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_"
        if not protocol_like:
            if key.startswith("FSSPEC"):
                warnings.warn(
                    f"Ignoring environment variable {key} due to having an unexpected name"
                )
            continue
        if key.count("_") > 1:
            # FSSPEC_<protocol>_<kwarg>: handled last so it overrides the dicts
            deferred_kwargs.append(key)
            continue
        try:
            value = json.loads(envdict[key])
        except json.decoder.JSONDecodeError as ex:
            warnings.warn(
                f"Ignoring environment variable {key} due to a parse failure: {ex}"
            )
            continue
        if isinstance(value, dict):
            _, proto = key.split("_", 1)
            conf_dict.setdefault(proto.lower(), {}).update(value)
        else:
            warnings.warn(
                f"Ignoring environment variable {key} due to not being a dict:"
                f" {type(value)}"
            )

    for key in deferred_kwargs:
        _, proto, kwarg = key.split("_", 2)
        # raw string value, key lower-cased; wins over the json-dict form
        conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
62
+
63
+
64
def set_conf_files(cdir, conf_dict):
    """Set config values from files

    Scans *cdir* for ``.ini`` and ``.json`` files (processed in sorted
    filename order, so later files win on conflicts) and merges their
    sections into *conf_dict*. INI values stay strings and are not
    converted; JSON values keep their parsed types.

    Parameters
    ----------
    cdir : str
        Directory to search; silently ignored when it does not exist
    conf_dict : dict(str, dict)
        This dict will be mutated
    """
    if not os.path.isdir(cdir):
        return
    for fname in sorted(os.listdir(cdir)):
        full = os.path.join(cdir, fname)
        if fname.endswith(".ini"):
            parser = configparser.ConfigParser()
            parser.read(full)
            for section in parser:
                # DEFAULT is configparser's implicit section, not a protocol
                if section == "DEFAULT":
                    continue
                conf_dict.setdefault(section, {}).update(dict(parser[section]))
        if fname.endswith(".json"):
            with open(full) as fh:
                loaded = json.load(fh)
            for section in loaded:
                conf_dict.setdefault(section, {}).update(dict(loaded[section]))
97
+
98
+
99
def apply_config(cls, kwargs, conf_dict=None):
    """Supply default values for kwargs when instantiating class

    Augments the passed kwargs, by finding entries in the config dict
    which match the classes ``.protocol`` attribute (one or more str)

    Parameters
    ----------
    cls : file system implementation
    kwargs : dict
    conf_dict : dict of dict
        Typically this is the global configuration

    Returns
    -------
    dict : the modified set of kwargs
    """
    if conf_dict is None:
        conf_dict = conf
    protocol = cls.protocol
    if not isinstance(protocol, (tuple, list)):
        protocol = [protocol]
    merged = {}
    for proto in protocol:
        # defaults from the current state of the config, per protocol alias
        merged.update(conf_dict.get(proto, {}))
    # explicit kwargs always win over configured defaults
    merged.update(**kwargs)
    return merged
128
+
129
+
130
+ set_conf_files(conf_dir, conf)
131
+ set_conf_env(conf)
meow/lib/python3.13/site-packages/fsspec/conftest.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import sys
5
+ import time
6
+
7
+ import pytest
8
+
9
+ import fsspec
10
+ from fsspec.implementations.cached import CachingFileSystem
11
+
12
+
13
@pytest.fixture()
def m():
    """
    Fixture providing a memory filesystem.

    The shared in-memory store is reset both before and after each test so
    state cannot leak between tests.
    """
    m = fsspec.filesystem("memory")
    m.store.clear()
    m.pseudo_dirs.clear()
    m.pseudo_dirs.append("")
    try:
        yield m
    finally:
        # always restore the pristine state, even if the test raised
        m.store.clear()
        m.pseudo_dirs.clear()
        m.pseudo_dirs.append("")
28
+
29
+
30
@pytest.fixture
def ftp_writable(tmpdir):
    """
    Fixture providing a writable FTP filesystem.

    Spawns a pyftpdlib server as a subprocess serving *tmpdir* and yields
    the (host, port, username, password) tuple to connect with.
    """
    pytest.importorskip("pyftpdlib")
    from fsspec.implementations.ftp import FTPFileSystem

    FTPFileSystem.clear_instance_cache()  # remove lingering connections
    CachingFileSystem.clear_instance_cache()
    d = str(tmpdir)
    # seed the served directory with a known file
    with open(os.path.join(d, "out"), "wb") as f:
        f.write(b"hello" * 10000)
    P = subprocess.Popen(
        [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
    )
    try:
        # crude wait for the server to come up; 2121 is pyftpdlib's
        # default port — presumably stable, verify if tests flake
        time.sleep(1)
        yield "localhost", 2121, "user", "pass"
    finally:
        P.terminate()
        P.wait()
        try:
            shutil.rmtree(tmpdir)
        except Exception:
            # best-effort cleanup; the OS temp reaper will handle leftovers
            pass
meow/lib/python3.13/site-packages/fsspec/core.py ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import os
6
+ import re
7
+ from glob import has_magic
8
+ from pathlib import Path
9
+
10
+ # for backwards compat, we export cache things from here too
11
+ from fsspec.caching import ( # noqa: F401
12
+ BaseCache,
13
+ BlockCache,
14
+ BytesCache,
15
+ MMapCache,
16
+ ReadAheadCache,
17
+ caches,
18
+ )
19
+ from fsspec.compression import compr
20
+ from fsspec.config import conf
21
+ from fsspec.registry import filesystem, get_filesystem_class
22
+ from fsspec.utils import (
23
+ _unstrip_protocol,
24
+ build_name_function,
25
+ infer_compression,
26
+ stringify_path,
27
+ )
28
+
29
# Shared module logger for fsspec core operations.
logger = logging.getLogger("fsspec")
30
+
31
+
32
class OpenFile:
    """
    File-like object to be used in a context

    Can layer (buffered) text-mode and compression over any file-system, which
    are typically binary-only.

    These instances are safe to serialize, as the low-level file object
    is not created until invoked using ``with``.

    Parameters
    ----------
    fs: FileSystem
        The file system to use for opening the file. Should be a subclass or duck-type
        with ``fsspec.spec.AbstractFileSystem``
    path: str
        Location to open
    mode: str like 'rb', optional
        Mode of the opened file
    compression: str or None, optional
        Compression to apply
    encoding: str or None, optional
        The encoding to use if opened in text mode.
    errors: str or None, optional
        How to handle encoding errors if opened in text mode.
    newline: None or str
        Passed to TextIOWrapper in text mode, how to handle line endings.
    autoopen: bool
        If True, calls open() immediately. Mostly used by pickle
    pos: int
        If given and autoopen is True, seek to this location immediately
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        compression=None,
        encoding=None,
        errors=None,
        newline=None,
    ):
        self.fs = fs
        self.path = path
        self.mode = mode
        # resolve "infer" (or None) against the path's suffix now, so the
        # instance pickles with a concrete compression name
        self.compression = get_compression(path, compression)
        self.encoding = encoding
        self.errors = errors
        self.newline = newline
        # stack of wrapped file objects, innermost (raw) first
        self.fobjects = []

    def __reduce__(self):
        # pickle support: rebuild from constructor arguments only
        return (
            OpenFile,
            (
                self.fs,
                self.path,
                self.mode,
                self.compression,
                self.encoding,
                self.errors,
                self.newline,
            ),
        )

    def __repr__(self):
        return f"<OpenFile '{self.path}'>"

    def __enter__(self):
        # the underlying filesystem is always opened in binary mode;
        # text decoding happens in a wrapper layered on top
        mode = self.mode.replace("t", "").replace("b", "") + "b"

        try:
            f = self.fs.open(self.path, mode=mode)
        except FileNotFoundError as e:
            if has_magic(self.path):
                raise FileNotFoundError(
                    "%s not found. The URL contains glob characters: you maybe needed\n"
                    "to pass expand=True in fsspec.open() or the storage_options of \n"
                    "your library. You can also set the config value 'open_expand'\n"
                    "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
                    self.path,
                ) from e
            raise

        self.fobjects = [f]

        if self.compression is not None:
            compress = compr[self.compression]
            f = compress(f, mode=mode[0])
            self.fobjects.append(f)

        if "b" not in self.mode:
            # assume, for example, that 'r' is equivalent to 'rt' as in builtin
            f = PickleableTextIOWrapper(
                f, encoding=self.encoding, errors=self.errors, newline=self.newline
            )
            self.fobjects.append(f)

        # the outermost wrapper is what the caller interacts with
        return self.fobjects[-1]

    def __exit__(self, *args):
        self.close()

    @property
    def full_name(self):
        # path re-qualified with its protocol, e.g. "s3://bucket/key"
        return _unstrip_protocol(self.path, self.fs)

    def open(self):
        """Materialise this as a real open file without context

        The OpenFile object should be explicitly closed to avoid enclosed file
        instances persisting. You must, therefore, keep a reference to the OpenFile
        during the life of the file-like it generates.
        """
        return self.__enter__()

    def close(self):
        """Close all encapsulated file objects"""
        # close outermost-first so wrappers flush into the layers beneath
        for f in reversed(self.fobjects):
            if "r" not in self.mode and not f.closed:
                f.flush()
            f.close()
        self.fobjects.clear()
+
157
+
158
class OpenFiles(list):
    """List of OpenFile instances

    Can be used in a single context, which opens and closes all of the
    contained files. Normal list access to get the elements works as
    normal.

    A special case is made for caching filesystems - the files will
    be down/uploaded together at the start or end of the context, and
    this may happen concurrently, if the target filesystem supports it.
    """

    def __init__(self, *args, mode="rb", fs=None):
        self.mode = mode
        self.fs = fs
        self.files = []
        super().__init__(*args)

    def __enter__(self):
        if self.fs is None:
            raise ValueError("Context has already been used")

        # walk down wrapped filesystems looking for open_many support
        # (e.g. a caching layer that can batch transfers)
        fs = self.fs
        while True:
            if hasattr(fs, "open_many"):
                # check for concurrent cache download; or set up for upload
                self.files = fs.open_many(self)
                return self.files
            if hasattr(fs, "fs") and fs.fs is not None:
                fs = fs.fs
            else:
                break
        return [s.__enter__() for s in self]

    def __exit__(self, *args):
        fs = self.fs
        [s.__exit__(*args) for s in self]
        if "r" not in self.mode:
            # same unwrapping walk as __enter__, to commit batched uploads
            while True:
                if hasattr(fs, "open_many"):
                    # check for concurrent cache upload
                    fs.commit_many(self.files)
                    return
                if hasattr(fs, "fs") and fs.fs is not None:
                    fs = fs.fs
                else:
                    break

    def __getitem__(self, item):
        out = super().__getitem__(item)
        if isinstance(item, slice):
            # slices stay usable as a (sub-)context
            return OpenFiles(out, mode=self.mode, fs=self.fs)
        return out

    def __repr__(self):
        return f"<List of {len(self)} OpenFile instances>"
214
+
215
+
216
def open_files(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    name_function=None,
    num=1,
    protocol=None,
    newline=None,
    auto_mkdir=True,
    expand=True,
    **kwargs,
):
    """Given a path or paths, return a list of ``OpenFile`` objects.

    For writing, a str path must contain the "*" character, which will be filled
    in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.

    For either reading or writing, can instead provide explicit list of paths.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    name_function: function or None
        if opening a set of files for writing, those files do not yet exist,
        so we need to generate their names by formatting the urlpath for
        each sequence number
    num: int [1]
        if writing mode, number of files we expect to create (passed to
        name+function)
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    auto_mkdir: bool (True)
        If in write mode, this will ensure the target directory exists before
        writing, by calling ``fs.mkdirs(exist_ok=True)``.
    expand: bool
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> files = open_files('2015-*-*.csv')  # doctest: +SKIP
    >>> files = open_files(
    ...     's3://bucket/2015-*-*.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP

    Returns
    -------
    An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
    be used as a single context

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    fs, fs_token, paths = get_fs_token_paths(
        urlpath,
        mode,
        num=num,
        name_function=name_function,
        storage_options=kwargs,
        protocol=protocol,
        expand=expand,
    )
    if fs.protocol == "file":
        # local filesystem creates directories itself at write time
        fs.auto_mkdir = auto_mkdir
    elif "r" not in mode and auto_mkdir:
        # pre-create each distinct parent directory for remote writes
        parents = {fs._parent(path) for path in paths}
        for parent in parents:
            try:
                fs.makedirs(parent, exist_ok=True)
            except PermissionError:
                # best-effort: some backends disallow mkdir but allow writes
                pass
    return OpenFiles(
        [
            OpenFile(
                fs,
                path,
                mode=mode,
                compression=compression,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )
            for path in paths
        ],
        mode=mode,
        fs=fs,
    )
+
330
+
331
def _un_chain(path, kwargs):
    """Split a chained URL (``"proto1::proto2://..."``) into its layers.

    Returns a list of ``(url, protocol, kwargs)`` triples, one per ``::``
    separated segment, in the same left-to-right order the segments appear
    in ``path``. Per-protocol options are pulled out of ``kwargs`` by
    protocol name (``kwargs[protocol]``), and any remaining shared options
    are merged into the first segment's kwargs.
    """
    # Avoid a circular import
    from fsspec.implementations.cached import CachingFileSystem

    if "::" in path:
        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
        bits = []
        for p in path.split("::"):
            if "://" in p or x.match(p):
                bits.append(p)
            else:
                # bare protocol word, e.g. "zip" in "zip::file://..." —
                # normalise to "zip://" so split_protocol finds it below
                bits.append(p + "://")
    else:
        bits = [path]
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    kwargs = kwargs.copy()  # we pop from it; don't mutate the caller's dict
    # iterate innermost-first so that a caching layer can refer to the
    # URL of the layer it wraps (previous_bit)
    for bit in reversed(bits):
        protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.pop(protocol, {})
        if bit is bits[0]:
            # leftmost segment picks up all options not claimed by a
            # specific protocol name
            kws.update(kwargs)
        # explicit per-protocol options win over options parsed from the URL
        kw = dict(
            **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
            **kws,
        )
        bit = cls._strip_protocol(bit)
        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
            # caching layers wrap the previously-processed (inner) URL
            bit = previous_bit
        out.append((bit, protocol, kw))
        previous_bit = bit
    out.reverse()  # restore original left-to-right segment order
    return out
367
+
368
+
369
def url_to_fs(url, **kwargs):
    """
    Turn fully-qualified and potentially chained URL into filesystem instance

    Parameters
    ----------
    url : str
        The fsspec-compatible URL
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Returns
    -------
    filesystem : FileSystem
        The new filesystem discovered from ``url`` and created with
        ``**kwargs``.
    urlpath : str
        The file-systems-specific URL for ``url``.
    """
    url = stringify_path(url)
    # non-FS arguments that appear in fsspec.open()
    # inspect could keep this in sync with open()'s signature
    known_kwargs = {
        "compression",
        "encoding",
        "errors",
        "expand",
        "mode",
        "name_function",
        "newline",
        "num",
    }
    # drop open()-only options so they are not passed to filesystem classes
    kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
    chain = _un_chain(url, kwargs)
    inkwargs = {}
    # Reverse iterate the chain, creating a nested target_* structure
    for i, ch in enumerate(reversed(chain)):
        urls, protocol, kw = ch
        if i == len(chain) - 1:
            # outermost layer: its kwargs become the top-level kwargs
            inkwargs = dict(**kw, **inkwargs)
            continue
        # inner layers are described to their wrapper via target_* keys
        inkwargs["target_options"] = dict(**kw, **inkwargs)
        inkwargs["target_protocol"] = protocol
        inkwargs["fo"] = urls
    urlpath, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    return fs, urlpath
417
+
418
+
419
# Default for open()'s ``expand`` argument, read once from the
# "open_expand" config key at import time (False when unset).
DEFAULT_EXPAND = conf.get("open_expand", False)
420
+
421
+
422
# NOTE: deliberately shadows the builtin ``open`` — this is fsspec's public API
def open(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    protocol=None,
    newline=None,
    expand=None,
    **kwargs,
):
    """Given a path or paths, return one ``OpenFile`` object.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath. Prefix with a protocol like ``s3://``
        to read from alternative filesystems. Should not include glob
        character(s).
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    expand: bool or None
        Whether to regard file paths containing special glob characters as needing
        expansion (finding the first match) or absolute. Setting False allows using
        paths which do embed such characters. If None (default), this argument
        takes its value from the DEFAULT_EXPAND module variable, which takes
        its initial value from the "open_expand" config value at startup, which will
        be False if not set.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> openfile = open('2015-01-01.csv')  # doctest: +SKIP
    >>> openfile = open(
    ...     's3://bucket/2015-01-01.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP
    >>> with openfile as f:
    ...     df = pd.read_csv(f)  # doctest: +SKIP
    ...

    Returns
    -------
    ``OpenFile`` object.

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    expand = DEFAULT_EXPAND if expand is None else expand
    # delegate to open_files with a single-element list; take the first result
    out = open_files(
        urlpath=[urlpath],
        mode=mode,
        compression=compression,
        encoding=encoding,
        errors=errors,
        protocol=protocol,
        newline=newline,
        expand=expand,
        **kwargs,
    )
    if not out:
        # glob expansion matched nothing
        raise FileNotFoundError(urlpath)
    return out[0]
505
+
506
+
507
def open_local(
    url: str | list[str] | Path | list[Path],
    mode: str = "rb",
    **storage_options: dict,
) -> str | list[str]:
    """Resolve ``url`` to concrete local file path(s) and return them.

    Only valid for filesystems that expose their files locally (attribute
    ``local_file=True``), either because they are local to begin with or
    because opening downloads the data (e.g. the caching filesystems).

    Parameters
    ----------
    url: str or list(str)
    mode: str
        Must be read mode
    storage_options:
        passed on to FS for or used by open_files (e.g., compression)
    """
    if "r" not in mode:
        raise ValueError("Can only ensure local files when reading")
    openfiles = open_files(url, mode=mode, **storage_options)
    backing_fs = openfiles[0].fs
    if not getattr(backing_fs, "local_file", False):
        raise ValueError(
            "open_local can only be used on a filesystem which"
            " has attribute local_file=True"
        )
    # entering the context triggers any download; .name is the local path
    with openfiles as handles:
        local_paths = [handle.name for handle in handles]
    single_input = isinstance(url, Path) or (
        isinstance(url, str) and not has_magic(url)
    )
    return local_paths[0] if single_input else local_paths
538
+
539
+
540
def get_compression(urlpath, compression):
    """Resolve and validate a compression codec name.

    "infer" is replaced by a guess from the filename suffix; any other
    non-None value must be a key of ``fsspec.compression.compr``.
    """
    resolved = infer_compression(urlpath) if compression == "infer" else compression
    if resolved is not None and resolved not in compr:
        raise ValueError(f"Compression type {resolved} not supported")
    return resolved
546
+
547
+
548
def split_protocol(urlpath):
    """Return protocol, path pair"""
    urlpath = stringify_path(urlpath)
    head, sep, rest = urlpath.partition("://")
    if sep and len(head) > 1:
        # single-character "protocols" are Windows drive letters, not schemes
        return head, rest
    if urlpath.startswith("data:"):
        # data URIs use a single colon separator
        return urlpath.split(":", 1)
    return None, urlpath
559
+
560
+
561
def strip_protocol(urlpath):
    """Return only path part of full URL, according to appropriate backend"""
    protocol = split_protocol(urlpath)[0]
    # each filesystem class knows how to normalise its own URLs
    implementation = get_filesystem_class(protocol)
    return implementation._strip_protocol(urlpath)
566
+
567
+
568
def expand_paths_if_needed(paths, mode, num, fs, name_function):
    """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
    in them (read mode).

    :param paths: list of paths
    mode: str
        Mode in which to open files.
    num: int
        If opening in writing mode, number of files we expect to create.
    fs: filesystem object
    name_function: callable
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    :return: list of paths
    """
    expanded_paths = []
    paths = list(paths)

    if "w" in mode:  # write mode
        # at most one path may carry a "*" output mask
        if sum(1 for p in paths if "*" in p) > 1:
            raise ValueError(
                "When writing data, only one filename mask can be specified."
            )
        num = max(num, len(paths))

        for curr_path in paths:
            if "*" in curr_path:
                # expand using name_function
                expanded_paths.extend(_expand_paths(curr_path, name_function, num))
            else:
                expanded_paths.append(curr_path)
        # if we generated more paths than asked for, trim the list
        if len(expanded_paths) > num:
            expanded_paths = expanded_paths[:num]

    else:  # read mode
        for curr_path in paths:
            if has_magic(curr_path):
                # expand using glob
                expanded_paths.extend(fs.glob(curr_path))
            else:
                expanded_paths.append(curr_path)

    return expanded_paths
613
+
614
+
615
def get_fs_token_paths(
    urlpath,
    mode="rb",
    num=1,
    name_function=None,
    storage_options=None,
    protocol=None,
    expand=True,
):
    """Filesystem, deterministic token, and paths from a urlpath and options.

    Parameters
    ----------
    urlpath: string or iterable
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    mode: str, optional
        Mode in which to open files.
    num: int, optional
        If opening in writing mode, number of files we expect to create.
    name_function: callable, optional
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    storage_options: dict, optional
        Additional keywords to pass to the filesystem class.
    protocol: str or None
        To override the protocol specifier in the URL
    expand: bool
        Expand string paths for writing, assuming the path is a directory
    """
    if isinstance(urlpath, (list, tuple, set)):
        if not urlpath:
            raise ValueError("empty urlpath sequence")
        # use the first entry to determine the filesystem for the whole set
        urlpath0 = stringify_path(next(iter(urlpath)))
    else:
        urlpath0 = stringify_path(urlpath)
    storage_options = storage_options or {}
    if protocol:
        storage_options["protocol"] = protocol
    chain = _un_chain(urlpath0, storage_options or {})
    inkwargs = {}
    # Reverse iterate the chain, creating a nested target_* structure
    for i, ch in enumerate(reversed(chain)):
        urls, nested_protocol, kw = ch
        if i == len(chain) - 1:
            # outermost layer: its kwargs become the top-level kwargs
            inkwargs = dict(**kw, **inkwargs)
            continue
        inkwargs["target_options"] = dict(**kw, **inkwargs)
        inkwargs["target_protocol"] = nested_protocol
        inkwargs["fo"] = urls
    paths, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    if isinstance(urlpath, (list, tuple, set)):
        pchains = [
            _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
        ]
        if len({pc[1] for pc in pchains}) > 1:
            # fix: was ValueError("... %s", urlpath), which left the message
            # unformatted (logging-style args are not interpolated)
            raise ValueError(f"Protocol mismatch getting fs from {urlpath}")
        paths = [pc[0] for pc in pchains]
    else:
        paths = fs._strip_protocol(paths)
    if isinstance(paths, (list, tuple, set)):
        if expand:
            paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
        elif not isinstance(paths, list):
            paths = list(paths)
    else:
        if ("w" in mode or "x" in mode) and expand:
            # single write path: generate `num` part-file names
            paths = _expand_paths(paths, name_function, num)
        elif "*" in paths:
            # read glob; directories are excluded from the match
            paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
        else:
            paths = [paths]

    return fs, fs._fs_token, paths
691
+
692
+
693
+ def _expand_paths(path, name_function, num):
694
+ if isinstance(path, str):
695
+ if path.count("*") > 1:
696
+ raise ValueError("Output path spec must contain exactly one '*'.")
697
+ elif "*" not in path:
698
+ path = os.path.join(path, "*.part")
699
+
700
+ if name_function is None:
701
+ name_function = build_name_function(num - 1)
702
+
703
+ paths = [path.replace("*", name_function(i)) for i in range(num)]
704
+ if paths != sorted(paths):
705
+ logger.warning(
706
+ "In order to preserve order between partitions"
707
+ " paths created with ``name_function`` should "
708
+ "sort to partition order"
709
+ )
710
+ elif isinstance(path, (tuple, list)):
711
+ assert len(path) == num
712
+ paths = list(path)
713
+ else:
714
+ raise ValueError(
715
+ "Path should be either\n"
716
+ "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
717
+ "2. A directory: 'foo/\n"
718
+ "3. A path with a '*' in it: 'foo.*.json'"
719
+ )
720
+ return paths
721
+
722
+
723
class PickleableTextIOWrapper(io.TextIOWrapper):
    """A pickleable variant of ``io.TextIOWrapper``.

    Plain ``TextIOWrapper`` instances cannot be pickled; this subclass
    remembers its constructor arguments and reconstructs itself from them.
    Requires that ``buffer`` itself be pickleable, as all
    ``AbstractBufferedFile`` instances are.
    """

    def __init__(
        self,
        buffer,
        encoding=None,
        errors=None,
        newline=None,
        line_buffering=False,
        write_through=False,
    ):
        # keep the exact arguments around so __reduce__ can replay them
        ctor_args = (buffer, encoding, errors, newline, line_buffering, write_through)
        self.args = ctor_args
        super().__init__(*ctor_args)

    def __reduce__(self):
        # rebuild by calling the class with the saved constructor arguments
        return PickleableTextIOWrapper, self.args
meow/lib/python3.13/site-packages/fsspec/dircache.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from collections.abc import MutableMapping
3
+ from functools import lru_cache
4
+
5
+
6
class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        self._cache = {}
        self._times = {}  # key -> time it was set (only when expiry enabled)
        if max_paths:
            # LRU-based eviction trick: _q is an lru_cache'd no-op keyed on
            # the cache key; once more than max_paths keys are "touched",
            # the lru_cache evicts the oldest, and the wrapped lambda pops
            # that key out of _cache as a side effect.
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                # entry is stale: drop it so the lookup below raises KeyError
                # NOTE(review): the matching _times entry is never removed
                # here (nor in clear/__delitem__) — confirm this slow growth
                # of _times is acceptable
                del self._cache[item]
        if self.max_paths:
            # register the access so LRU eviction tracks recency
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        self._cache.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        # delegate to __getitem__ so expiry/LRU bookkeeping applies
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            # caching disabled: silently drop the write
            return
        if self.max_paths:
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        entries = list(self._cache)

        # filter through __contains__ so expired entries are skipped
        return (k for k in entries if k in self)

    def __reduce__(self):
        # pickle the configuration only; cached listings are not preserved
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
meow/lib/python3.13/site-packages/fsspec/exceptions.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ fsspec user-defined exception classes
3
+ """
4
+
5
+ import asyncio
6
+
7
+
8
class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with
    """
13
+
14
+
15
class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when an fsspec function call times out
    """
meow/lib/python3.13/site-packages/fsspec/fuse.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import stat
5
+ import threading
6
+ import time
7
+ from errno import EIO, ENOENT
8
+
9
+ from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
10
+
11
+ from fsspec import __version__
12
+ from fsspec.core import url_to_fs
13
+
14
+ logger = logging.getLogger("fsspec.fuse")
15
+
16
+
17
class FUSEr(Operations):
    """fusepy ``Operations`` implementation backed by an fsspec filesystem.

    Each FUSE callback maps the mounted path onto a path within ``fs``
    (prefixed with ``root``) and delegates to the corresponding fsspec
    method. Open file objects are held in ``self.cache``, keyed by an
    integer file handle allocated by this class.
    """

    def __init__(self, fs, path, ready_file=False):
        self.fs = fs
        self.cache = {}  # fh (int) -> open fsspec file object
        self.root = path.rstrip("/") + "/"
        self.counter = 0  # next file handle to hand out
        logger.info("Starting FUSE at %s", path)
        # when True, expose a synthetic /.fuse_ready file as a readiness probe
        self._ready_file = ready_file

    def getattr(self, path, fh=None):
        logger.debug("getattr %s", path)
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            return {"type": "file", "st_size": 5}

        path = "".join([self.root, path.lstrip("/")]).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError as exc:
            raise FuseOSError(ENOENT) from exc

        # default uid/gid when the backend does not report ownership
        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        # fall back to "now" for timestamps the backend does not provide
        data["st_atime"] = info["atime"] if "atime" in info else time.time()
        data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
        data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
        return data

    def readdir(self, path, fh):
        logger.debug("readdir %s", path)
        path = "".join([self.root, path.lstrip("/")])
        files = self.fs.ls(path, False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.mkdir(path)
        return 0

    def rmdir(self, path):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.rmdir(path)
        return 0

    def read(self, path, size, offset, fh):
        logger.debug("read %s", (path, size, offset))
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        logger.debug("create %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        self.fs.touch(fn)  # OS will want to get attributes immediately
        f = self.fs.open(fn, "wb")
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        logger.debug("open %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        if flags % 2 == 0:
            # read
            mode = "rb"
        else:
            # write/create
            mode = "wb"
        self.cache[self.counter] = self.fs.open(fn, mode)
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        fn = "".join([self.root, path.lstrip("/")])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(fn)

    def unlink(self, path):
        fn = "".join([self.root, path.lstrip("/")])
        try:
            # FileNotFoundError is a subclass of OSError, so one clause suffices
            self.fs.rm(fn, False)
        except OSError as exc:
            raise FuseOSError(EIO) from exc

    def release(self, path, fh):
        try:
            if fh in self.cache:
                f = self.cache[fh]
                f.close()
                self.cache.pop(fh)
        except Exception:
            # a failed close must not crash the FUSE loop; log it instead
            # of printing to stdout (was: print(e))
            logger.exception("Error releasing fh=%s for %s", fh, path)
        return 0

    def chmod(self, path, mode):
        if hasattr(self.fs, "chmod"):
            path = "".join([self.root, path.lstrip("/")])
            return self.fs.chmod(path, mode)
        raise NotImplementedError
140
+
141
+
142
def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount ``path`` of an fsspec filesystem at a local directory via FUSE.

    Requires fusepy and a working FUSE installation on the host (typically
    installed with apt/yum/brew etc.).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounter directory. Operation will typically be more
        stable if False.
    ready_file: bool
        Whether the FUSE process is ready. The ``.fuse_ready`` file will
        exist in the ``mount_point`` directory if True. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For Example, logging
        to file.
    """

    def _start_fuse():
        # blocks until the mount is torn down
        FUSE(
            ops_class(fs, path, ready_file=ready_file),
            mount_point,
            nothreads=not threads,
            foreground=foreground,
        )

    if not foreground:
        # run the FUSE loop on a daemon thread and hand it back to the caller
        mount_thread = threading.Thread(target=_start_fuse)
        mount_thread.daemon = True
        mount_thread.start()
        return mount_thread
    else:  # pragma: no cover
        try:
            _start_fuse()
        except KeyboardInterrupt:
            pass
202
+
203
+
204
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """
    # NOTE: the docstring above doubles as the CLI help text (it is passed
    # as the parser description below), so its wording is runtime-visible.

    # replace the auto-generated description block of the help output with
    # the raw (un-reflowed) docstring
    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    # NOTE(review): store_false gives these three flags a default of True
    # and passing the flag sets them to False, which contradicts the
    # "(Default: False)" help text — confirm the intended polarity
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    # parse -o KEY=VALUE options into kwargs; "fsname-setting=value" keys
    # become nested per-filesystem dicts, and "[int]"/"[bool]" suffixes
    # coerce the value's type
    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        # mix in fusepy's LoggingMixIn so every FUSE call is logged
        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
319
+
320
+
321
if __name__ == "__main__":
    import sys

    # CLI entry point: forward the command-line arguments (minus argv[0])
    main(sys.argv[1:])
meow/lib/python3.13/site-packages/fsspec/generic.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from typing import Optional
9
+
10
+ from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
11
+ from .callbacks import DEFAULT_CALLBACK
12
+ from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
13
+
14
+ _generic_fs = {}
15
+ logger = logging.getLogger("fsspec.generic")
16
+
17
+
18
+ def set_generic_fs(protocol, **storage_options):
19
+ _generic_fs[protocol] = filesystem(protocol, **storage_options)
20
+
21
+
22
+ default_method = "default"
23
+
24
+
25
+ def _resolve_fs(url, method=None, protocol=None, storage_options=None):
26
+ """Pick instance of backend FS"""
27
+ method = method or default_method
28
+ protocol = protocol or split_protocol(url)[0]
29
+ storage_options = storage_options or {}
30
+ if method == "default":
31
+ return filesystem(protocol)
32
+ if method == "generic":
33
+ return _generic_fs[protocol]
34
+ if method == "current":
35
+ cls = get_filesystem_class(protocol)
36
+ return cls.current()
37
+ if method == "options":
38
+ fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
39
+ return fs
40
+ raise ValueError(f"Unknown FS resolution method: {method}")
41
+
42
+
43
+ def rsync(
44
+ source,
45
+ destination,
46
+ delete_missing=False,
47
+ source_field="size",
48
+ dest_field="size",
49
+ update_cond="different",
50
+ inst_kwargs=None,
51
+ fs=None,
52
+ **kwargs,
53
+ ):
54
+ """Sync files between two directory trees
55
+
56
+ (experimental)
57
+
58
+ Parameters
59
+ ----------
60
+ source: str
61
+ Root of the directory tree to take files from. This must be a directory, but
62
+ do not include any terminating "/" character
63
+ destination: str
64
+ Root path to copy into. The contents of this location should be
65
+ identical to the contents of ``source`` when done. This will be made a
66
+ directory, and the terminal "/" should not be included.
67
+ delete_missing: bool
68
+ If there are paths in the destination that don't exist in the
69
+ source and this is True, delete them. Otherwise, leave them alone.
70
+ source_field: str | callable
71
+ If ``update_field`` is "different", this is the key in the info
72
+ of source files to consider for difference. Maybe a function of the
73
+ info dict.
74
+ dest_field: str | callable
75
+ If ``update_field`` is "different", this is the key in the info
76
+ of destination files to consider for difference. May be a function of
77
+ the info dict.
78
+ update_cond: "different"|"always"|"never"
79
+ If "always", every file is copied, regardless of whether it exists in
80
+ the destination. If "never", files that exist in the destination are
81
+ not copied again. If "different" (default), only copy if the info
82
+ fields given by ``source_field`` and ``dest_field`` (usually "size")
83
+ are different. Other comparisons may be added in the future.
84
+ inst_kwargs: dict|None
85
+ If ``fs`` is None, use this set of keyword arguments to make a
86
+ GenericFileSystem instance
87
+ fs: GenericFileSystem|None
88
+ Instance to use if explicitly given. The instance defines how to
89
+ to make downstream file system instances from paths.
90
+
91
+ Returns
92
+ -------
93
+ dict of the copy operations that were performed, {source: destination}
94
+ """
95
+ fs = fs or GenericFileSystem(**(inst_kwargs or {}))
96
+ source = fs._strip_protocol(source)
97
+ destination = fs._strip_protocol(destination)
98
+ allfiles = fs.find(source, withdirs=True, detail=True)
99
+ if not fs.isdir(source):
100
+ raise ValueError("Can only rsync on a directory")
101
+ otherfiles = fs.find(destination, withdirs=True, detail=True)
102
+ dirs = [
103
+ a
104
+ for a, v in allfiles.items()
105
+ if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
106
+ ]
107
+ logger.debug(f"{len(dirs)} directories to create")
108
+ if dirs:
109
+ fs.make_many_dirs(
110
+ [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
111
+ )
112
+ allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
113
+ logger.debug(f"{len(allfiles)} files to consider for copy")
114
+ to_delete = [
115
+ o
116
+ for o, v in otherfiles.items()
117
+ if o.replace(destination, source) not in allfiles and v["type"] == "file"
118
+ ]
119
+ for k, v in allfiles.copy().items():
120
+ otherfile = k.replace(source, destination)
121
+ if otherfile in otherfiles:
122
+ if update_cond == "always":
123
+ allfiles[k] = otherfile
124
+ elif update_cond == "different":
125
+ inf1 = source_field(v) if callable(source_field) else v[source_field]
126
+ v2 = otherfiles[otherfile]
127
+ inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
128
+ if inf1 != inf2:
129
+ # details mismatch, make copy
130
+ allfiles[k] = otherfile
131
+ else:
132
+ # details match, don't copy
133
+ allfiles.pop(k)
134
+ else:
135
+ # file not in target yet
136
+ allfiles[k] = otherfile
137
+ logger.debug(f"{len(allfiles)} files to copy")
138
+ if allfiles:
139
+ source_files, target_files = zip(*allfiles.items())
140
+ fs.cp(source_files, target_files, **kwargs)
141
+ logger.debug(f"{len(to_delete)} files to delete")
142
+ if delete_missing and to_delete:
143
+ fs.rm(to_delete)
144
+ return allfiles
145
+
146
+
147
+ class GenericFileSystem(AsyncFileSystem):
148
+ """Wrapper over all other FS types
149
+
150
+ <experimental!>
151
+
152
+ This implementation is a single unified interface to be able to run FS operations
153
+ over generic URLs, and dispatch to the specific implementations using the URL
154
+ protocol prefix.
155
+
156
+ Note: instances of this FS are always async, even if you never use it with any async
157
+ backend.
158
+ """
159
+
160
+ protocol = "generic" # there is no real reason to ever use a protocol with this FS
161
+
162
+ def __init__(self, default_method="default", **kwargs):
163
+ """
164
+
165
+ Parameters
166
+ ----------
167
+ default_method: str (optional)
168
+ Defines how to configure backend FS instances. Options are:
169
+ - "default": instantiate like FSClass(), with no
170
+ extra arguments; this is the default instance of that FS, and can be
171
+ configured via the config system
172
+ - "generic": takes instances from the `_generic_fs` dict in this module,
173
+ which you must populate before use. Keys are by protocol
174
+ - "current": takes the most recently instantiated version of each FS
175
+ """
176
+ self.method = default_method
177
+ super().__init__(**kwargs)
178
+
179
+ def _parent(self, path):
180
+ fs = _resolve_fs(path, self.method)
181
+ return fs.unstrip_protocol(fs._parent(path))
182
+
183
+ def _strip_protocol(self, path):
184
+ # normalization only
185
+ fs = _resolve_fs(path, self.method)
186
+ return fs.unstrip_protocol(fs._strip_protocol(path))
187
+
188
+ async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
189
+ fs = _resolve_fs(path, self.method)
190
+ if fs.async_impl:
191
+ out = await fs._find(
192
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
193
+ )
194
+ else:
195
+ out = fs.find(
196
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
197
+ )
198
+ result = {}
199
+ for k, v in out.items():
200
+ v = v.copy() # don't corrupt target FS dircache
201
+ name = fs.unstrip_protocol(k)
202
+ v["name"] = name
203
+ result[name] = v
204
+ if detail:
205
+ return result
206
+ return list(result)
207
+
208
+ async def _info(self, url, **kwargs):
209
+ fs = _resolve_fs(url, self.method)
210
+ if fs.async_impl:
211
+ out = await fs._info(url, **kwargs)
212
+ else:
213
+ out = fs.info(url, **kwargs)
214
+ out = out.copy() # don't edit originals
215
+ out["name"] = fs.unstrip_protocol(out["name"])
216
+ return out
217
+
218
+ async def _ls(
219
+ self,
220
+ url,
221
+ detail=True,
222
+ **kwargs,
223
+ ):
224
+ fs = _resolve_fs(url, self.method)
225
+ if fs.async_impl:
226
+ out = await fs._ls(url, detail=True, **kwargs)
227
+ else:
228
+ out = fs.ls(url, detail=True, **kwargs)
229
+ out = [o.copy() for o in out] # don't edit originals
230
+ for o in out:
231
+ o["name"] = fs.unstrip_protocol(o["name"])
232
+ if detail:
233
+ return out
234
+ else:
235
+ return [o["name"] for o in out]
236
+
237
+ async def _cat_file(
238
+ self,
239
+ url,
240
+ **kwargs,
241
+ ):
242
+ fs = _resolve_fs(url, self.method)
243
+ if fs.async_impl:
244
+ return await fs._cat_file(url, **kwargs)
245
+ else:
246
+ return fs.cat_file(url, **kwargs)
247
+
248
+ async def _pipe_file(
249
+ self,
250
+ path,
251
+ value,
252
+ **kwargs,
253
+ ):
254
+ fs = _resolve_fs(path, self.method)
255
+ if fs.async_impl:
256
+ return await fs._pipe_file(path, value, **kwargs)
257
+ else:
258
+ return fs.pipe_file(path, value, **kwargs)
259
+
260
+ async def _rm(self, url, **kwargs):
261
+ urls = url
262
+ if isinstance(urls, str):
263
+ urls = [urls]
264
+ fs = _resolve_fs(urls[0], self.method)
265
+ if fs.async_impl:
266
+ await fs._rm(urls, **kwargs)
267
+ else:
268
+ fs.rm(url, **kwargs)
269
+
270
+ async def _makedirs(self, path, exist_ok=False):
271
+ logger.debug("Make dir %s", path)
272
+ fs = _resolve_fs(path, self.method)
273
+ if fs.async_impl:
274
+ await fs._makedirs(path, exist_ok=exist_ok)
275
+ else:
276
+ fs.makedirs(path, exist_ok=exist_ok)
277
+
278
+ def rsync(self, source, destination, **kwargs):
279
+ """Sync files between two directory trees
280
+
281
+ See `func:rsync` for more details.
282
+ """
283
+ rsync(source, destination, fs=self, **kwargs)
284
+
285
+ async def _cp_file(
286
+ self,
287
+ url,
288
+ url2,
289
+ blocksize=2**20,
290
+ callback=DEFAULT_CALLBACK,
291
+ **kwargs,
292
+ ):
293
+ fs = _resolve_fs(url, self.method)
294
+ fs2 = _resolve_fs(url2, self.method)
295
+ if fs is fs2:
296
+ # pure remote
297
+ if fs.async_impl:
298
+ return await fs._cp_file(url, url2, **kwargs)
299
+ else:
300
+ return fs.cp_file(url, url2, **kwargs)
301
+ kw = {"blocksize": 0, "cache_type": "none"}
302
+ try:
303
+ f1 = (
304
+ await fs.open_async(url, "rb")
305
+ if hasattr(fs, "open_async")
306
+ else fs.open(url, "rb", **kw)
307
+ )
308
+ callback.set_size(await maybe_await(f1.size))
309
+ f2 = (
310
+ await fs2.open_async(url2, "wb")
311
+ if hasattr(fs2, "open_async")
312
+ else fs2.open(url2, "wb", **kw)
313
+ )
314
+ while f1.size is None or f2.tell() < f1.size:
315
+ data = await maybe_await(f1.read(blocksize))
316
+ if f1.size is None and not data:
317
+ break
318
+ await maybe_await(f2.write(data))
319
+ callback.absolute_update(f2.tell())
320
+ finally:
321
+ try:
322
+ await maybe_await(f2.close())
323
+ await maybe_await(f1.close())
324
+ except NameError:
325
+ # fail while opening f1 or f2
326
+ pass
327
+
328
+ async def _make_many_dirs(self, urls, exist_ok=True):
329
+ fs = _resolve_fs(urls[0], self.method)
330
+ if fs.async_impl:
331
+ coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
332
+ await _run_coros_in_chunks(coros)
333
+ else:
334
+ for u in urls:
335
+ fs.makedirs(u, exist_ok=exist_ok)
336
+
337
+ make_many_dirs = sync_wrapper(_make_many_dirs)
338
+
339
+ async def _copy(
340
+ self,
341
+ path1: list[str],
342
+ path2: list[str],
343
+ recursive: bool = False,
344
+ on_error: str = "ignore",
345
+ maxdepth: Optional[int] = None,
346
+ batch_size: Optional[int] = None,
347
+ tempdir: Optional[str] = None,
348
+ **kwargs,
349
+ ):
350
+ if recursive:
351
+ raise NotImplementedError
352
+ fs = _resolve_fs(path1[0], self.method)
353
+ fs2 = _resolve_fs(path2[0], self.method)
354
+ # not expanding paths atm., assume call is from rsync()
355
+ if fs is fs2:
356
+ # pure remote
357
+ if fs.async_impl:
358
+ return await fs._copy(path1, path2, **kwargs)
359
+ else:
360
+ return fs.copy(path1, path2, **kwargs)
361
+ await copy_file_op(
362
+ fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
363
+ )
364
+
365
+
366
+ async def copy_file_op(
367
+ fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
368
+ ):
369
+ import tempfile
370
+
371
+ tempdir = tempdir or tempfile.mkdtemp()
372
+ try:
373
+ coros = [
374
+ _copy_file_op(
375
+ fs1,
376
+ u1,
377
+ fs2,
378
+ u2,
379
+ os.path.join(tempdir, uuid.uuid4().hex),
380
+ on_error=on_error,
381
+ )
382
+ for u1, u2 in zip(url1, url2)
383
+ ]
384
+ await _run_coros_in_chunks(coros, batch_size=batch_size)
385
+ finally:
386
+ shutil.rmtree(tempdir)
387
+
388
+
389
+ async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
390
+ ex = () if on_error == "raise" else Exception
391
+ logger.debug("Copy %s -> %s", url1, url2)
392
+ try:
393
+ if fs1.async_impl:
394
+ await fs1._get_file(url1, local)
395
+ else:
396
+ fs1.get_file(url1, local)
397
+ if fs2.async_impl:
398
+ await fs2._put_file(local, url2)
399
+ else:
400
+ fs2.put_file(local, url2)
401
+ os.unlink(local)
402
+ logger.debug("Copy %s -> %s; done", url1, url2)
403
+ except ex as e:
404
+ logger.debug("ignoring cp exception for %s: %s", url1, e)
405
+
406
+
407
+ async def maybe_await(cor):
408
+ if inspect.iscoroutine(cor):
409
+ return await cor
410
+ else:
411
+ return cor
meow/lib/python3.13/site-packages/fsspec/gui.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import re
6
+ from typing import ClassVar, Sequence
7
+
8
+ import panel as pn
9
+
10
+ from .core import OpenFile, get_filesystem_class, split_protocol
11
+ from .registry import known_implementations
12
+
13
+ pn.extension()
14
+ logger = logging.getLogger("fsspec.gui")
15
+
16
+
17
+ class SigSlot:
18
+ """Signal-slot mixin, for Panel event passing
19
+
20
+ Include this class in a widget manager's superclasses to be able to
21
+ register events and callbacks on Panel widgets managed by that class.
22
+
23
+ The method ``_register`` should be called as widgets are added, and external
24
+ code should call ``connect`` to associate callbacks.
25
+
26
+ By default, all signals emit a DEBUG logging statement.
27
+ """
28
+
29
+ # names of signals that this class may emit each of which must be
30
+ # set by _register for any new instance
31
+ signals: ClassVar[Sequence[str]] = []
32
+ # names of actions that this class may respond to
33
+ slots: ClassVar[Sequence[str]] = []
34
+
35
+ # each of which must be a method name
36
+
37
+ def __init__(self):
38
+ self._ignoring_events = False
39
+ self._sigs = {}
40
+ self._map = {}
41
+ self._setup()
42
+
43
+ def _setup(self):
44
+ """Create GUI elements and register signals"""
45
+ self.panel = pn.pane.PaneBase()
46
+ # no signals to set up in the base class
47
+
48
+ def _register(
49
+ self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
50
+ ):
51
+ """Watch the given attribute of a widget and assign it a named event
52
+
53
+ This is normally called at the time a widget is instantiated, in the
54
+ class which owns it.
55
+
56
+ Parameters
57
+ ----------
58
+ widget : pn.layout.Panel or None
59
+ Widget to watch. If None, an anonymous signal not associated with
60
+ any widget.
61
+ name : str
62
+ Name of this event
63
+ thing : str
64
+ Attribute of the given widget to watch
65
+ log_level : int
66
+ When the signal is triggered, a logging event of the given level
67
+ will be fired in the dfviz logger.
68
+ auto : bool
69
+ If True, automatically connects with a method in this class of the
70
+ same name.
71
+ """
72
+ if name not in self.signals:
73
+ raise ValueError(f"Attempt to assign an undeclared signal: {name}")
74
+ self._sigs[name] = {
75
+ "widget": widget,
76
+ "callbacks": [],
77
+ "thing": thing,
78
+ "log": log_level,
79
+ }
80
+ wn = "-".join(
81
+ [
82
+ getattr(widget, "name", str(widget)) if widget is not None else "none",
83
+ thing,
84
+ ]
85
+ )
86
+ self._map[wn] = name
87
+ if widget is not None:
88
+ widget.param.watch(self._signal, thing, onlychanged=True)
89
+ if auto and hasattr(self, name):
90
+ self.connect(name, getattr(self, name))
91
+
92
+ def _repr_mimebundle_(self, *args, **kwargs):
93
+ """Display in a notebook or a server"""
94
+ try:
95
+ return self.panel._repr_mimebundle_(*args, **kwargs)
96
+ except (ValueError, AttributeError) as exc:
97
+ raise NotImplementedError(
98
+ "Panel does not seem to be set up properly"
99
+ ) from exc
100
+
101
+ def connect(self, signal, slot):
102
+ """Associate call back with given event
103
+
104
+ The callback must be a function which takes the "new" value of the
105
+ watched attribute as the only parameter. If the callback return False,
106
+ this cancels any further processing of the given event.
107
+
108
+ Alternatively, the callback can be a string, in which case it means
109
+ emitting the correspondingly-named event (i.e., connect to self)
110
+ """
111
+ self._sigs[signal]["callbacks"].append(slot)
112
+
113
+ def _signal(self, event):
114
+ """This is called by a an action on a widget
115
+
116
+ Within an self.ignore_events context, nothing happens.
117
+
118
+ Tests can execute this method by directly changing the values of
119
+ widget components.
120
+ """
121
+ if not self._ignoring_events:
122
+ wn = "-".join([event.obj.name, event.name])
123
+ if wn in self._map and self._map[wn] in self._sigs:
124
+ self._emit(self._map[wn], event.new)
125
+
126
+ @contextlib.contextmanager
127
+ def ignore_events(self):
128
+ """Temporarily turn off events processing in this instance
129
+
130
+ (does not propagate to children)
131
+ """
132
+ self._ignoring_events = True
133
+ try:
134
+ yield
135
+ finally:
136
+ self._ignoring_events = False
137
+
138
+ def _emit(self, sig, value=None):
139
+ """An event happened, call its callbacks
140
+
141
+ This method can be used in tests to simulate message passing without
142
+ directly changing visual elements.
143
+
144
+ Calling of callbacks will halt whenever one returns False.
145
+ """
146
+ logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
147
+ for callback in self._sigs[sig]["callbacks"]:
148
+ if isinstance(callback, str):
149
+ self._emit(callback)
150
+ else:
151
+ try:
152
+ # running callbacks should not break the interface
153
+ ret = callback(value)
154
+ if ret is False:
155
+ break
156
+ except Exception as e:
157
+ logger.exception(
158
+ "Exception (%s) while executing callback for signal: %s",
159
+ e,
160
+ sig,
161
+ )
162
+
163
+ def show(self, threads=False):
164
+ """Open a new browser tab and display this instance's interface"""
165
+ self.panel.show(threads=threads, verbose=False)
166
+ return self
167
+
168
+
169
+ class SingleSelect(SigSlot):
170
+ """A multiselect which only allows you to select one item for an event"""
171
+
172
+ signals = ["_selected", "selected"] # the first is internal
173
+ slots = ["set_options", "set_selection", "add", "clear", "select"]
174
+
175
+ def __init__(self, **kwargs):
176
+ self.kwargs = kwargs
177
+ super().__init__()
178
+
179
+ def _setup(self):
180
+ self.panel = pn.widgets.MultiSelect(**self.kwargs)
181
+ self._register(self.panel, "_selected", "value")
182
+ self._register(None, "selected")
183
+ self.connect("_selected", self.select_one)
184
+
185
+ def _signal(self, *args, **kwargs):
186
+ super()._signal(*args, **kwargs)
187
+
188
+ def select_one(self, *_):
189
+ with self.ignore_events():
190
+ val = [self.panel.value[-1]] if self.panel.value else []
191
+ self.panel.value = val
192
+ self._emit("selected", self.panel.value)
193
+
194
+ def set_options(self, options):
195
+ self.panel.options = options
196
+
197
+ def clear(self):
198
+ self.panel.options = []
199
+
200
+ @property
201
+ def value(self):
202
+ return self.panel.value
203
+
204
+ def set_selection(self, selection):
205
+ self.panel.value = [selection]
206
+
207
+
208
+ class FileSelector(SigSlot):
209
+ """Panel-based graphical file selector widget
210
+
211
+ Instances of this widget are interactive and can be displayed in jupyter by having
212
+ them as the output of a cell, or in a separate browser tab using ``.show()``.
213
+ """
214
+
215
+ signals = [
216
+ "protocol_changed",
217
+ "selection_changed",
218
+ "directory_entered",
219
+ "home_clicked",
220
+ "up_clicked",
221
+ "go_clicked",
222
+ "filters_changed",
223
+ ]
224
+ slots = ["set_filters", "go_home"]
225
+
226
+ def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
227
+ """
228
+
229
+ Parameters
230
+ ----------
231
+ url : str (optional)
232
+ Initial value of the URL to populate the dialog; should include protocol
233
+ filters : list(str) (optional)
234
+ File endings to include in the listings. If not included, all files are
235
+ allowed. Does not affect directories.
236
+ If given, the endings will appear as checkboxes in the interface
237
+ ignore : list(str) (optional)
238
+ Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
239
+ hidden files on posix
240
+ kwargs : dict (optional)
241
+ To pass to file system instance
242
+ """
243
+ if url:
244
+ self.init_protocol, url = split_protocol(url)
245
+ else:
246
+ self.init_protocol, url = "file", os.getcwd()
247
+ self.init_url = url
248
+ self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
249
+ self.filters = filters
250
+ self.ignore = [re.compile(i) for i in ignore or []]
251
+ self._fs = None
252
+ super().__init__()
253
+
254
+ def _setup(self):
255
+ self.url = pn.widgets.TextInput(
256
+ name="url",
257
+ value=self.init_url,
258
+ align="end",
259
+ sizing_mode="stretch_width",
260
+ width_policy="max",
261
+ )
262
+ self.protocol = pn.widgets.Select(
263
+ options=sorted(known_implementations),
264
+ value=self.init_protocol,
265
+ name="protocol",
266
+ align="center",
267
+ )
268
+ self.kwargs = pn.widgets.TextInput(
269
+ name="kwargs", value=self.init_kwargs, align="center"
270
+ )
271
+ self.go = pn.widgets.Button(name="⇨", align="end", width=45)
272
+ self.main = SingleSelect(size=10)
273
+ self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
274
+ self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")
275
+
276
+ self._register(self.protocol, "protocol_changed", auto=True)
277
+ self._register(self.go, "go_clicked", "clicks", auto=True)
278
+ self._register(self.up, "up_clicked", "clicks", auto=True)
279
+ self._register(self.home, "home_clicked", "clicks", auto=True)
280
+ self._register(None, "selection_changed")
281
+ self.main.connect("selected", self.selection_changed)
282
+ self._register(None, "directory_entered")
283
+ self.prev_protocol = self.protocol.value
284
+ self.prev_kwargs = self.storage_options
285
+
286
+ self.filter_sel = pn.widgets.CheckBoxGroup(
287
+ value=[], options=[], inline=False, align="end", width_policy="min"
288
+ )
289
+ self._register(self.filter_sel, "filters_changed", auto=True)
290
+
291
+ self.panel = pn.Column(
292
+ pn.Row(self.protocol, self.kwargs),
293
+ pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
294
+ self.main.panel,
295
+ )
296
+ self.set_filters(self.filters)
297
+ self.go_clicked()
298
+
299
+ def set_filters(self, filters=None):
300
+ self.filters = filters
301
+ if filters:
302
+ self.filter_sel.options = filters
303
+ self.filter_sel.value = filters
304
+ else:
305
+ self.filter_sel.options = []
306
+ self.filter_sel.value = []
307
+
308
+ @property
309
+ def storage_options(self):
310
+ """Value of the kwargs box as a dictionary"""
311
+ return ast.literal_eval(self.kwargs.value) or {}
312
+
313
+ @property
314
+ def fs(self):
315
+ """Current filesystem instance"""
316
+ if self._fs is None:
317
+ cls = get_filesystem_class(self.protocol.value)
318
+ self._fs = cls(**self.storage_options)
319
+ return self._fs
320
+
321
+ @property
322
+ def urlpath(self):
323
+ """URL of currently selected item"""
324
+ return (
325
+ (f"{self.protocol.value}://{self.main.value[0]}")
326
+ if self.main.value
327
+ else None
328
+ )
329
+
330
+ def open_file(self, mode="rb", compression=None, encoding=None):
331
+ """Create OpenFile instance for the currently selected item
332
+
333
+ For example, in a notebook you might do something like
334
+
335
+ .. code-block::
336
+
337
+ [ ]: sel = FileSelector(); sel
338
+
339
+ # user selects their file
340
+
341
+ [ ]: with sel.open_file('rb') as f:
342
+ ... out = f.read()
343
+
344
+ Parameters
345
+ ----------
346
+ mode: str (optional)
347
+ Open mode for the file.
348
+ compression: str (optional)
349
+ The interact with the file as compressed. Set to 'infer' to guess
350
+ compression from the file ending
351
+ encoding: str (optional)
352
+ If using text mode, use this encoding; defaults to UTF8.
353
+ """
354
+ if self.urlpath is None:
355
+ raise ValueError("No file selected")
356
+ return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
357
+
358
+ def filters_changed(self, values):
359
+ self.filters = values
360
+ self.go_clicked()
361
+
362
+ def selection_changed(self, *_):
363
+ if self.urlpath is None:
364
+ return
365
+ if self.fs.isdir(self.urlpath):
366
+ self.url.value = self.fs._strip_protocol(self.urlpath)
367
+ self.go_clicked()
368
+
369
+ def go_clicked(self, *_):
370
+ if (
371
+ self.prev_protocol != self.protocol.value
372
+ or self.prev_kwargs != self.storage_options
373
+ ):
374
+ self._fs = None # causes fs to be recreated
375
+ self.prev_protocol = self.protocol.value
376
+ self.prev_kwargs = self.storage_options
377
+ listing = sorted(
378
+ self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
379
+ )
380
+ listing = [
381
+ l
382
+ for l in listing
383
+ if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
384
+ ]
385
+ folders = {
386
+ "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
387
+ for o in listing
388
+ if o["type"] == "directory"
389
+ }
390
+ files = {
391
+ "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
392
+ for o in listing
393
+ if o["type"] == "file"
394
+ }
395
+ if self.filters:
396
+ files = {
397
+ k: v
398
+ for k, v in files.items()
399
+ if any(v.endswith(ext) for ext in self.filters)
400
+ }
401
+ self.main.set_options(dict(**folders, **files))
402
+
403
+ def protocol_changed(self, *_):
404
+ self._fs = None
405
+ self.main.options = []
406
+ self.url.value = ""
407
+
408
+ def home_clicked(self, *_):
409
+ self.protocol.value = self.init_protocol
410
+ self.kwargs.value = self.init_kwargs
411
+ self.url.value = self.init_url
412
+ self.go_clicked()
413
+
414
+ def up_clicked(self, *_):
415
+ self.url.value = self.fs._parent(self.url.value)
416
+ self.go_clicked()
meow/lib/python3.13/site-packages/fsspec/json.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from contextlib import suppress
3
+ from pathlib import PurePath
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ ClassVar,
8
+ Dict,
9
+ List,
10
+ Mapping,
11
+ Optional,
12
+ Sequence,
13
+ Tuple,
14
+ )
15
+
16
+ from .registry import _import_class, get_filesystem_class
17
+ from .spec import AbstractFileSystem
18
+
19
+
20
+ class FilesystemJSONEncoder(json.JSONEncoder):
21
+ include_password: ClassVar[bool] = True
22
+
23
+ def default(self, o: Any) -> Any:
24
+ if isinstance(o, AbstractFileSystem):
25
+ return o.to_dict(include_password=self.include_password)
26
+ if isinstance(o, PurePath):
27
+ cls = type(o)
28
+ return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
29
+
30
+ return super().default(o)
31
+
32
+ def make_serializable(self, obj: Any) -> Any:
33
+ """
34
+ Recursively converts an object so that it can be JSON serialized via
35
+ :func:`json.dumps` and :func:`json.dump`, without actually calling
36
+ said functions.
37
+ """
38
+ if isinstance(obj, (str, int, float, bool)):
39
+ return obj
40
+ if isinstance(obj, Mapping):
41
+ return {k: self.make_serializable(v) for k, v in obj.items()}
42
+ if isinstance(obj, Sequence):
43
+ return [self.make_serializable(v) for v in obj]
44
+
45
+ return self.default(obj)
46
+
47
+
48
+ class FilesystemJSONDecoder(json.JSONDecoder):
49
+ def __init__(
50
+ self,
51
+ *,
52
+ object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
53
+ parse_float: Optional[Callable[[str], Any]] = None,
54
+ parse_int: Optional[Callable[[str], Any]] = None,
55
+ parse_constant: Optional[Callable[[str], Any]] = None,
56
+ strict: bool = True,
57
+ object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
58
+ ) -> None:
59
+ self.original_object_hook = object_hook
60
+
61
+ super().__init__(
62
+ object_hook=self.custom_object_hook,
63
+ parse_float=parse_float,
64
+ parse_int=parse_int,
65
+ parse_constant=parse_constant,
66
+ strict=strict,
67
+ object_pairs_hook=object_pairs_hook,
68
+ )
69
+
70
+ @classmethod
71
+ def try_resolve_path_cls(cls, dct: Dict[str, Any]):
72
+ with suppress(Exception):
73
+ fqp = dct["cls"]
74
+
75
+ path_cls = _import_class(fqp)
76
+
77
+ if issubclass(path_cls, PurePath):
78
+ return path_cls
79
+
80
+ return None
81
+
82
+ @classmethod
83
+ def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
84
+ with suppress(Exception):
85
+ if "cls" in dct:
86
+ try:
87
+ fs_cls = _import_class(dct["cls"])
88
+ if issubclass(fs_cls, AbstractFileSystem):
89
+ return fs_cls
90
+ except Exception:
91
+ if "protocol" in dct: # Fallback if cls cannot be imported
92
+ return get_filesystem_class(dct["protocol"])
93
+
94
+ raise
95
+
96
+ return None
97
+
98
+ def custom_object_hook(self, dct: Dict[str, Any]):
99
+ if "cls" in dct:
100
+ if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
101
+ return AbstractFileSystem.from_dict(dct)
102
+ if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
103
+ return obj_cls(dct["str"])
104
+
105
+ if self.original_object_hook is not None:
106
+ return self.original_object_hook(dct)
107
+
108
+ return dct
109
+
110
+ def unmake_serializable(self, obj: Any) -> Any:
111
+ """
112
+ Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
113
+ """
114
+ if isinstance(obj, dict):
115
+ obj = self.custom_object_hook(obj)
116
+ if isinstance(obj, dict):
117
+ return {k: self.unmake_serializable(v) for k, v in obj.items()}
118
+ if isinstance(obj, (list, tuple)):
119
+ return [self.unmake_serializable(v) for v in obj]
120
+
121
+ return obj
meow/lib/python3.13/site-packages/fsspec/mapping.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import array
2
+ import logging
3
+ import posixpath
4
+ import warnings
5
+ from collections.abc import MutableMapping
6
+ from functools import cached_property
7
+
8
+ from fsspec.core import url_to_fs
9
+
10
+ logger = logging.getLogger("fsspec.mapping")
11
+
12
+
13
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable wrapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=True)
        performs a touch at the location, to check for write access.

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root)
        # Join a dummy component then strip it, so that the stored prefix
        # ends with whatever separator the filesystem produces.
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        # Exception types treated as "key absent" and converted to KeyError.
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f" with the ``create=True`` keyword"
                )
            # Verify write access by creating and removing a probe file.
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        logger.info("Clear mapping at %s", self.root)
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except:  # noqa: E722
            # Best-effort: a failure to remove/recreate leaves the store as-is.
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            They keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        # For "omit" we still need the per-key exceptions back, so ask the
        # filesystem for "return" and filter afterwards.
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                # Single-path cat may return raw bytes rather than a dict.
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            raise KeyError from e
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                # Lists are unhashable; normalise to tuple before str().
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}".rstrip("/")

    def _str_to_key(self, s):
        """Strip path of to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions as exc:
            if default is not None:
                return default
            raise KeyError(key) from exc
        return result

    def pop(self, key, default=None):
        """Pop data"""
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            # Value already fetched (or defaulted); deletion failure is benign.
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        # Ensure parent directories exist for nested keys like "a/b/c".
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key"""
        try:
            self.fs.rm(self._key_to_str(key))
        except Exception as exc:
            raise KeyError from exc

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.isfile(path)

    def __reduce__(self):
        # check/create are passed as False: those actions were already
        # performed (if requested) when the original instance was built.
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
197
+
198
+
199
def maybe_convert(value):
    """Coerce array-like values to ``bytes``; pass anything else through.

    ``array.array`` instances and objects exposing ``__array__`` (e.g. numpy
    arrays) are flattened to their raw buffer contents.  datetime64 and
    timedelta64 arrays are first reinterpreted as int64, because those dtypes
    do not support the buffer protocol.
    """
    is_arraylike = isinstance(value, array.array) or hasattr(value, "__array__")
    if not is_arraylike:
        return value
    if hasattr(value, "dtype") and value.dtype.kind in "Mm":
        # Buffer interface can't handle datetime64/timedelta64 directly.
        value = value.view("int64")
    return bytes(memoryview(value))
208
+
209
+
210
def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Whether to attempt to read from the location before instantiation, to
        check that the mapping does exist
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override
    **kwargs :
        Passed on to the filesystem constructor via ``url_to_fs``.

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Resolve the filesystem and the protocol-stripped root path; remaining
    # kwargs configure the filesystem itself.
    fs, derived_root = url_to_fs(url, **kwargs)
    root = derived_root if alternate_root is None else alternate_root
    return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
meow/lib/python3.13/site-packages/fsspec/parquet.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import warnings
4
+
5
+ from .core import url_to_fs
6
+ from .utils import merge_offset_ranges
7
+
8
+ # Parquet-Specific Utilities for fsspec
9
+ #
10
+ # Most of the functions defined in this module are NOT
11
+ # intended for public consumption. The only exception
12
+ # to this is `open_parquet_file`, which should be used
13
+ # place of `fs.open()` to open parquet-formatted files
14
+ # on remote file systems.
15
+
16
+
17
def open_parquet_file(
    path,
    mode="rb",
    fs=None,
    metadata=None,
    columns=None,
    row_groups=None,
    storage_options=None,
    strict=False,
    engine="auto",
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    **kwargs,
):
    """
    Return a file-like object for a single Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str
        Target file path.
    mode: str, optional
        Mode option to be passed through to `fs.open`. Default is "rb".
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    columns: list, optional
        List of all column names that may be read from the file.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    strict : bool, optional
        Whether the resulting `KnownPartsOfAFile` cache should
        fetch reads that go beyond a known byte-range boundary.
        If `False` (the default), any read that ends outside a
        known part will be zero padded. Note that using
        `strict=True` may be useful for debugging.
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`
    """

    # Make sure we have an `AbstractFileSystem` object
    # to work with
    if fs is None:
        fs = url_to_fs(path, **(storage_options or {}))[0]

    # For now, `columns == []` not supported. Just use
    # default `open` command with `path` input
    if columns is not None and len(columns) == 0:
        return fs.open(path, mode=mode)

    # Set the engine
    engine = _set_engine(engine)

    # Fetch the known byte ranges needed to read
    # `columns` and/or `row_groups`
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )

    # Extract file name from `data`.  With a single input path the dict
    # holds at most one entry; fall back to `path` if nothing was fetched.
    fn = next(iter(data)) if data else path

    # Call self.open with "parts" caching, seeding the cache with the
    # byte ranges gathered above.
    options = kwargs.pop("cache_options", {}).copy()
    return fs.open(
        fn,
        mode=mode,
        cache_type="parts",
        cache_options={
            **options,
            "data": data.get(fn, {}),
            "strict": strict,
        },
        **kwargs,
    )
139
+
140
+
141
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
):
    """Get a dictionary of the known byte ranges needed
    to read a specific column/row-group selection from a
    Parquet dataset. Each value in the output dictionary
    is intended for use as the `data` argument for the
    `KnownPartsOfAFile` caching strategy of a single path.
    """

    # Set engine if necessary
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # Pass to specialized function if metadata is defined
    if metadata is not None:
        # Use the provided parquet metadata object
        # to avoid transferring/parsing footer metadata
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
        )

    # Get file sizes asynchronously
    file_sizes = fs.sizes(paths)

    # Populate global paths, starts, & ends
    result = {}
    data_paths = []
    data_starts = []
    data_ends = []
    add_header_magic = True
    if columns is None and row_groups is None:
        # We are NOT selecting specific columns or row-groups.
        #
        # We can avoid sampling the footers, and just transfer
        # all file data with cat_ranges
        for i, path in enumerate(paths):
            result[path] = {}
            for b in range(0, file_sizes[i], max_block):
                data_paths.append(path)
                data_starts.append(b)
                data_ends.append(min(b + max_block, file_sizes[i]))
        add_header_magic = False  # "Magic" should already be included
    else:
        # We ARE selecting specific columns or row-groups.
        #
        # Gather file footers.
        # We just take the last `footer_sample_size` bytes of each
        # file (or the entire file if it is smaller than that)
        footer_starts = []
        footer_ends = []
        for i, path in enumerate(paths):
            footer_ends.append(file_sizes[i])
            sample_size = max(0, file_sizes[i] - footer_sample_size)
            footer_starts.append(sample_size)
        footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)

        # Check our footer samples and re-sample if necessary.
        missing_footer_starts = footer_starts.copy()
        large_footer = 0
        for i, path in enumerate(paths):
            # A parquet file ends with <footer metadata><4-byte little-endian
            # footer length><b"PAR1">, so the length is 8..4 bytes from EOF.
            footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
            real_footer_start = file_sizes[i] - (footer_size + 8)
            if real_footer_start < footer_starts[i]:
                missing_footer_starts[i] = real_footer_start
                large_footer = max(large_footer, (footer_size + 8))
        if large_footer:
            warnings.warn(
                f"Not enough data was used to sample the parquet footer. "
                f"Try setting footer_sample_size >= {large_footer}."
            )
            # Fetch the missing leading portion of each footer and prepend
            # it to the sample already in hand.
            for i, block in enumerate(
                fs.cat_ranges(
                    paths,
                    missing_footer_starts,
                    footer_starts,
                )
            ):
                footer_samples[i] = block + footer_samples[i]
                footer_starts[i] = missing_footer_starts[i]

        # Calculate required byte ranges for each path
        for i, path in enumerate(paths):
            # Deal with small-file case.
            # Just include all remaining bytes of the file
            # in a single range.
            if file_sizes[i] < max_block:
                if footer_starts[i] > 0:
                    # Only need to transfer the data if the
                    # footer sample isn't already the whole file
                    data_paths.append(path)
                    data_starts.append(0)
                    data_ends.append(footer_starts[i])
                continue

            # Use "engine" to collect data byte ranges
            path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                columns,
                row_groups=row_groups,
                footer=footer_samples[i],
                footer_start=footer_starts[i],
            )

            data_paths += [path] * len(path_data_starts)
            data_starts += path_data_starts
            data_ends += path_data_ends

        # Merge adjacent offset ranges
        data_paths, data_starts, data_ends = merge_offset_ranges(
            data_paths,
            data_starts,
            data_ends,
            max_gap=max_gap,
            max_block=max_block,
            sort=False,  # Should already be sorted
        )

        # Start by populating `result` with footer samples
        for i, path in enumerate(paths):
            result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}

    # Transfer the data byte-ranges into local memory
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to header if necessary
    if add_header_magic:
        _add_header_magic(result)

    return result
284
+
285
+
286
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
):
    """Simplified version of `_get_parquet_byte_ranges` for
    the case that an engine-specific `metadata` object is
    provided, and the remote footer metadata does not need to
    be transferred before calculating the required byte ranges.
    """

    # Use "engine" to collect data byte ranges.  With `metadata` supplied,
    # the engine returns paths as well, since one metadata object may span
    # multiple files.
    data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
        columns,
        row_groups=row_groups,
        metadata=metadata,
    )

    # Merge adjacent offset ranges
    data_paths, data_starts, data_ends = merge_offset_ranges(
        data_paths,
        data_starts,
        data_ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should be sorted
    )

    # Transfer the data byte-ranges into local memory
    result = {fn: {} for fn in list(set(data_paths))}
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to header
    _add_header_magic(result)

    return result
326
+
327
+
328
+ def _transfer_ranges(fs, blocks, paths, starts, ends):
329
+ # Use cat_ranges to gather the data byte_ranges
330
+ ranges = (paths, starts, ends)
331
+ for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
332
+ blocks[path][(start, stop)] = data
333
+
334
+
335
+ def _add_header_magic(data):
336
+ # Add b"PAR1" to file headers
337
+ for path in list(data.keys()):
338
+ add_magic = True
339
+ for k in data[path]:
340
+ if k[0] == 0 and k[1] >= 4:
341
+ add_magic = False
342
+ break
343
+ if add_magic:
344
+ data[path][(0, 4)] = b"PAR1"
345
+
346
+
347
+ def _set_engine(engine_str):
348
+ # Define a list of parquet engines to try
349
+ if engine_str == "auto":
350
+ try_engines = ("fastparquet", "pyarrow")
351
+ elif not isinstance(engine_str, str):
352
+ raise ValueError(
353
+ "Failed to set parquet engine! "
354
+ "Please pass 'fastparquet', 'pyarrow', or 'auto'"
355
+ )
356
+ elif engine_str not in ("fastparquet", "pyarrow"):
357
+ raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
358
+ else:
359
+ try_engines = [engine_str]
360
+
361
+ # Try importing the engines in `try_engines`,
362
+ # and choose the first one that succeeds
363
+ for engine in try_engines:
364
+ try:
365
+ if engine == "fastparquet":
366
+ return FastparquetEngine()
367
+ elif engine == "pyarrow":
368
+ return PyarrowEngine()
369
+ except ImportError:
370
+ pass
371
+
372
+ # Raise an error if a supported parquet engine
373
+ # was not found
374
+ raise ImportError(
375
+ f"The following parquet engines are not installed "
376
+ f"in your python environment: {try_engines}."
377
+ f"Please install 'fastparquert' or 'pyarrow' to "
378
+ f"utilize the `fsspec.parquet` module."
379
+ )
380
+
381
+
382
class FastparquetEngine:
    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        import fastparquet as fp

        self.fp = fp

    def _row_group_filename(self, row_group, pf):
        # Map a row-group to the file that stores it (multi-file datasets).
        return pf.row_group_filename(row_group)

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if pf is None:
            # No metadata supplied: parse the sampled footer bytes instead.
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata:
            # loop over them directly, with no index filtering.
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices:
            # loop over all row-groups in the file and filter by index.
            row_group_indices = row_groups
            row_groups = pf.row_groups

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is None or r in row_group_indices:
                # Find the target parquet-file path for `row_group`
                fn = self._row_group_filename(row_group, pf)

                for column in row_group.columns:
                    name = column.meta_data.path_in_schema[0]
                    # Skip this column if we are targeting a
                    # specific columns
                    if column_set is None or name in column_set:
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.meta_data.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.meta_data.data_page_offset
                        num_bytes = column.meta_data.total_compressed_size
                        if footer_start is None or file_offset0 < footer_start:
                            data_paths.append(fn)
                            data_starts.append(file_offset0)
                            # Clip the range at footer_start (already cached)
                            # when a footer boundary is known.
                            data_ends.append(
                                min(
                                    file_offset0 + num_bytes,
                                    footer_start or (file_offset0 + num_bytes),
                                )
                            )

        if metadata:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
465
+
466
+
467
class PyarrowEngine:
    # The purpose of the PyarrowEngine class is
    # to check if pyarrow can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to pyarrow.

    def __init__(self):
        import pyarrow.parquet as pq

        self.pq = pq

    def _row_group_filename(self, row_group, metadata):
        # Pyarrow metadata does not expose per-row-group file paths.
        raise NotImplementedError

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")

        data_starts, data_ends = [], []
        md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None:
            schema = md.schema.to_arrow_schema()
            has_pandas_metadata = (
                schema.metadata is not None and b"pandas" in schema.metadata
            )
            if has_pandas_metadata:
                md_index = [
                    ind
                    for ind in json.loads(
                        schema.metadata[b"pandas"].decode("utf8")
                    ).get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                ]
                column_set |= set(md_index)

        # Loop through column chunks to add required byte ranges
        for r in range(md.num_row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_groups is None or r in row_groups:
                row_group = md.row_group(r)
                for c in range(row_group.num_columns):
                    column = row_group.column(c)
                    name = column.path_in_schema
                    # Skip this column if we are targeting a
                    # specific columns.  Nested columns also match on
                    # their top-level field name.
                    split_name = name.split(".")[0]
                    if (
                        column_set is None
                        or name in column_set
                        or split_name in column_set
                    ):
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.data_page_offset
                        num_bytes = column.total_compressed_size
                        if file_offset0 < footer_start:
                            # Clip at footer_start; that tail is already cached.
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(file_offset0 + num_bytes, footer_start)
                            )
        return data_starts, data_ends
meow/lib/python3.13/site-packages/fsspec/registry.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import types
5
+ import warnings
6
+
7
+ __all__ = ["registry", "get_filesystem_class", "default"]
8
+
9
+ # internal, mutable
10
+ _registry: dict[str, type] = {}
11
+
12
+ # external, immutable
13
+ registry = types.MappingProxyType(_registry)
14
+ default = "file"
15
+
16
+
17
def register_implementation(name, cls, clobber=False, errtxt=None):
    """Add implementation class to the registry

    Parameters
    ----------
    name: str
        Protocol name to associate with the class
    cls: class or str
        if a class: fsspec-compliant implementation class (normally inherits from
        ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
        str, the full path to an implementation class like package.module.class,
        which gets added to known_implementations,
        so the import is deferred until the filesystem is actually used.
    clobber: bool (optional)
        Whether to overwrite a protocol with the same name; if False, will raise
        instead.
    errtxt: str (optional)
        If given, then a failure to import the given class will result in this
        text being given.
    """
    if isinstance(cls, str):
        # Deferred registration: record the dotted path for later import.
        existing = known_implementations.get(name)
        if existing is not None and clobber is False:
            if cls != existing["class"]:
                raise ValueError(
                    f"Name ({name}) already in the known_implementations and clobber "
                    f"is False"
                )
            # Same target class: re-registering is a no-op.
        else:
            known_implementations[name] = {
                "class": cls,
                "err": errtxt or f"{cls} import failed for protocol {name}",
            }
    else:
        # Direct registration of an implementation class.
        if name in registry and clobber is False:
            if _registry[name] is not cls:
                raise ValueError(
                    f"Name ({name}) already in the registry and clobber is False"
                )
            # Same class object: re-registering is a no-op.
        else:
            _registry[name] = cls
58
+
59
+
60
+ # protocols mapped to the class which implements them. This dict can be
61
+ # updated with register_implementation
62
+ known_implementations = {
63
+ "abfs": {
64
+ "class": "adlfs.AzureBlobFileSystem",
65
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
66
+ },
67
+ "adl": {
68
+ "class": "adlfs.AzureDatalakeFileSystem",
69
+ "err": "Install adlfs to access Azure Datalake Gen1",
70
+ },
71
+ "arrow_hdfs": {
72
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
73
+ "err": "pyarrow and local java libraries required for HDFS",
74
+ },
75
+ "asynclocal": {
76
+ "class": "morefs.asyn_local.AsyncLocalFileSystem",
77
+ "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
78
+ },
79
+ "az": {
80
+ "class": "adlfs.AzureBlobFileSystem",
81
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
82
+ },
83
+ "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
84
+ "box": {
85
+ "class": "boxfs.BoxFileSystem",
86
+ "err": "Please install boxfs to access BoxFileSystem",
87
+ },
88
+ "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
89
+ "dask": {
90
+ "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
91
+ "err": "Install dask distributed to access worker file system",
92
+ },
93
+ "data": {"class": "fsspec.implementations.data.DataFileSystem"},
94
+ "dbfs": {
95
+ "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
96
+ "err": "Install the requests package to use the DatabricksFileSystem",
97
+ },
98
+ "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
99
+ "dropbox": {
100
+ "class": "dropboxdrivefs.DropboxDriveFileSystem",
101
+ "err": (
102
+ 'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
103
+ '"dropbox" to be installed'
104
+ ),
105
+ },
106
+ "dvc": {
107
+ "class": "dvc.api.DVCFileSystem",
108
+ "err": "Install dvc to access DVCFileSystem",
109
+ },
110
+ "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
111
+ "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
112
+ "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
113
+ "gcs": {
114
+ "class": "gcsfs.GCSFileSystem",
115
+ "err": "Please install gcsfs to access Google Storage",
116
+ },
117
+ "gdrive": {
118
+ "class": "gdrivefs.GoogleDriveFileSystem",
119
+ "err": "Please install gdrivefs for access to Google Drive",
120
+ },
121
+ "generic": {"class": "fsspec.generic.GenericFileSystem"},
122
+ "git": {
123
+ "class": "fsspec.implementations.git.GitFileSystem",
124
+ "err": "Install pygit2 to browse local git repos",
125
+ },
126
+ "github": {
127
+ "class": "fsspec.implementations.github.GithubFileSystem",
128
+ "err": "Install the requests package to use the github FS",
129
+ },
130
+ "gs": {
131
+ "class": "gcsfs.GCSFileSystem",
132
+ "err": "Please install gcsfs to access Google Storage",
133
+ },
134
+ "hdfs": {
135
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
136
+ "err": "pyarrow and local java libraries required for HDFS",
137
+ },
138
+ "hf": {
139
+ "class": "huggingface_hub.HfFileSystem",
140
+ "err": "Install huggingface_hub to access HfFileSystem",
141
+ },
142
+ "http": {
143
+ "class": "fsspec.implementations.http.HTTPFileSystem",
144
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
145
+ },
146
+ "https": {
147
+ "class": "fsspec.implementations.http.HTTPFileSystem",
148
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
149
+ },
150
+ "jlab": {
151
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
152
+ "err": "Jupyter FS requires requests to be installed",
153
+ },
154
+ "jupyter": {
155
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
156
+ "err": "Jupyter FS requires requests to be installed",
157
+ },
158
+ "lakefs": {
159
+ "class": "lakefs_spec.LakeFSFileSystem",
160
+ "err": "Please install lakefs-spec to access LakeFSFileSystem",
161
+ },
162
+ "libarchive": {
163
+ "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
164
+ "err": "LibArchive requires to be installed",
165
+ },
166
+ "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
167
+ "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
168
+ "oci": {
169
+ "class": "ocifs.OCIFileSystem",
170
+ "err": "Install ocifs to access OCI Object Storage",
171
+ },
172
+ "ocilake": {
173
+ "class": "ocifs.OCIFileSystem",
174
+ "err": "Install ocifs to access OCI Data Lake",
175
+ },
176
+ "oss": {
177
+ "class": "ossfs.OSSFileSystem",
178
+ "err": "Install ossfs to access Alibaba Object Storage System",
179
+ },
180
+ "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
181
+ "root": {
182
+ "class": "fsspec_xrootd.XRootDFileSystem",
183
+ "err": (
184
+ "Install fsspec-xrootd to access xrootd storage system. "
185
+ "Note: 'root' is the protocol name for xrootd storage systems, "
186
+ "not referring to root directories"
187
+ ),
188
+ },
189
+ "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
190
+ "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
191
+ "sftp": {
192
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
193
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
194
+ },
195
+ "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
196
+ "smb": {
197
+ "class": "fsspec.implementations.smb.SMBFileSystem",
198
+ "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
199
+ },
200
+ "ssh": {
201
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
202
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
203
+ },
204
+ "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
205
+ "tosfs": {
206
+ "class": "tosfs.TosFileSystem",
207
+ "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
208
+ },
209
+ "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
210
+ "webdav": {
211
+ "class": "webdav4.fsspec.WebdavFileSystem",
212
+ "err": "Install webdav4 to access WebDAV",
213
+ },
214
+ "webhdfs": {
215
+ "class": "fsspec.implementations.webhdfs.WebHDFS",
216
+ "err": 'webHDFS access requires "requests" to be installed',
217
+ },
218
+ "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
219
+ }
220
+
221
+ assert list(known_implementations) == sorted(
222
+ known_implementations
223
+ ), "Not in alphabetical order"
224
+
225
+
226
+ def get_filesystem_class(protocol):
227
+ """Fetch named protocol implementation from the registry
228
+
229
+ The dict ``known_implementations`` maps protocol names to the locations
230
+ of classes implementing the corresponding file-system. When used for the
231
+ first time, appropriate imports will happen and the class will be placed in
232
+ the registry. All subsequent calls will fetch directly from the registry.
233
+
234
+ Some protocol implementations require additional dependencies, and so the
235
+ import may fail. In this case, the string in the "err" field of the
236
+ ``known_implementations`` will be given as the error message.
237
+ """
238
+ if not protocol:
239
+ protocol = default
240
+
241
+ if protocol not in registry:
242
+ if protocol not in known_implementations:
243
+ raise ValueError(f"Protocol not known: {protocol}")
244
+ bit = known_implementations[protocol]
245
+ try:
246
+ register_implementation(protocol, _import_class(bit["class"]))
247
+ except ImportError as e:
248
+ raise ImportError(bit["err"]) from e
249
+ cls = registry[protocol]
250
+ if getattr(cls, "protocol", None) in ("abstract", None):
251
+ cls.protocol = protocol
252
+
253
+ return cls
254
+
255
+
256
+ s3_msg = """Your installed version of s3fs is very old and known to cause
257
+ severe performance issues, see also https://github.com/dask/dask/issues/10276
258
+
259
+ To fix, you should specify a lower version bound on s3fs, or
260
+ update the current installation.
261
+ """
262
+
263
+
264
+ def _import_class(fqp: str):
265
+ """Take a fully-qualified path and return the imported class or identifier.
266
+
267
+ ``fqp`` is of the form "package.module.klass" or
268
+ "package.module:subobject.klass".
269
+
270
+ Warnings
271
+ --------
272
+ This can import arbitrary modules. Make sure you haven't installed any modules
273
+ that may execute malicious code at import time.
274
+ """
275
+ if ":" in fqp:
276
+ mod, name = fqp.rsplit(":", 1)
277
+ else:
278
+ mod, name = fqp.rsplit(".", 1)
279
+
280
+ is_s3 = mod == "s3fs"
281
+ mod = importlib.import_module(mod)
282
+ if is_s3 and mod.__version__.split(".") < ["0", "5"]:
283
+ warnings.warn(s3_msg)
284
+ for part in name.split("."):
285
+ mod = getattr(mod, part)
286
+
287
+ if not isinstance(mod, type):
288
+ raise TypeError(f"{fqp} is not a class")
289
+
290
+ return mod
291
+
292
+
293
+ def filesystem(protocol, **storage_options):
294
+ """Instantiate filesystems for given protocol and arguments
295
+
296
+ ``storage_options`` are specific to the protocol being chosen, and are
297
+ passed directly to the class.
298
+ """
299
+ if protocol == "arrow_hdfs":
300
+ warnings.warn(
301
+ "The 'arrow_hdfs' protocol has been deprecated and will be "
302
+ "removed in the future. Specify it as 'hdfs'.",
303
+ DeprecationWarning,
304
+ )
305
+
306
+ cls = get_filesystem_class(protocol)
307
+ return cls(**storage_options)
308
+
309
+
310
+ def available_protocols():
311
+ """Return a list of the implemented protocols.
312
+
313
+ Note that any given protocol may require extra packages to be importable.
314
+ """
315
+ return list(known_implementations)
meow/lib/python3.13/site-packages/fsspec/spec.py ADDED
@@ -0,0 +1,2242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import threading
8
+ import warnings
9
+ import weakref
10
+ from errno import ESPIPE
11
+ from glob import has_magic
12
+ from hashlib import sha256
13
+ from typing import Any, ClassVar
14
+
15
+ from .callbacks import DEFAULT_CALLBACK
16
+ from .config import apply_config, conf
17
+ from .dircache import DirCache
18
+ from .transaction import Transaction
19
+ from .utils import (
20
+ _unstrip_protocol,
21
+ glob_translate,
22
+ isfilelike,
23
+ other_paths,
24
+ read_block,
25
+ stringify_path,
26
+ tokenize,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
+ def make_instance(cls, args, kwargs):
33
+ return cls(*args, **kwargs)
34
+
35
+
36
+ class _Cached(type):
37
+ """
38
+ Metaclass for caching file system instances.
39
+
40
+ Notes
41
+ -----
42
+ Instances are cached according to
43
+
44
+ * The values of the class attributes listed in `_extra_tokenize_attributes`
45
+ * The arguments passed to ``__init__``.
46
+
47
+ This creates an additional reference to the filesystem, which prevents the
48
+ filesystem from being garbage collected when all *user* references go away.
49
+ A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
50
+ be made for a filesystem instance to be garbage collected.
51
+ """
52
+
53
+ def __init__(cls, *args, **kwargs):
54
+ super().__init__(*args, **kwargs)
55
+ # Note: we intentionally create a reference here, to avoid garbage
56
+ # collecting instances when all other references are gone. To really
57
+ # delete a FileSystem, the cache must be cleared.
58
+ if conf.get("weakref_instance_cache"): # pragma: no cover
59
+ # debug option for analysing fork/spawn conditions
60
+ cls._cache = weakref.WeakValueDictionary()
61
+ else:
62
+ cls._cache = {}
63
+ cls._pid = os.getpid()
64
+
65
+ def __call__(cls, *args, **kwargs):
66
+ kwargs = apply_config(cls, kwargs)
67
+ extra_tokens = tuple(
68
+ getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
69
+ )
70
+ token = tokenize(
71
+ cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
72
+ )
73
+ skip = kwargs.pop("skip_instance_cache", False)
74
+ if os.getpid() != cls._pid:
75
+ cls._cache.clear()
76
+ cls._pid = os.getpid()
77
+ if not skip and cls.cachable and token in cls._cache:
78
+ cls._latest = token
79
+ return cls._cache[token]
80
+ else:
81
+ obj = super().__call__(*args, **kwargs)
82
+ # Setting _fs_token here causes some static linters to complain.
83
+ obj._fs_token_ = token
84
+ obj.storage_args = args
85
+ obj.storage_options = kwargs
86
+ if obj.async_impl and obj.mirror_sync_methods:
87
+ from .asyn import mirror_sync_methods
88
+
89
+ mirror_sync_methods(obj)
90
+
91
+ if cls.cachable and not skip:
92
+ cls._latest = token
93
+ cls._cache[token] = obj
94
+ return obj
95
+
96
+
97
+ class AbstractFileSystem(metaclass=_Cached):
98
+ """
99
+ An abstract super-class for pythonic file-systems
100
+
101
+ Implementations are expected to be compatible with or, better, subclass
102
+ from here.
103
+ """
104
+
105
+ cachable = True # this class can be cached, instances reused
106
+ _cached = False
107
+ blocksize = 2**22
108
+ sep = "/"
109
+ protocol: ClassVar[str | tuple[str, ...]] = "abstract"
110
+ _latest = None
111
+ async_impl = False
112
+ mirror_sync_methods = False
113
+ root_marker = "" # For some FSs, may require leading '/' or other character
114
+ transaction_type = Transaction
115
+
116
+ #: Extra *class attributes* that should be considered when hashing.
117
+ _extra_tokenize_attributes = ()
118
+
119
+ # Set by _Cached metaclass
120
+ storage_args: tuple[Any, ...]
121
+ storage_options: dict[str, Any]
122
+
123
+ def __init__(self, *args, **storage_options):
124
+ """Create and configure file-system instance
125
+
126
+ Instances may be cachable, so if similar enough arguments are seen
127
+ a new instance is not required. The token attribute exists to allow
128
+ implementations to cache instances if they wish.
129
+
130
+ A reasonable default should be provided if there are no arguments.
131
+
132
+ Subclasses should call this method.
133
+
134
+ Parameters
135
+ ----------
136
+ use_listings_cache, listings_expiry_time, max_paths:
137
+ passed to ``DirCache``, if the implementation supports
138
+ directory listing caching. Pass use_listings_cache=False
139
+ to disable such caching.
140
+ skip_instance_cache: bool
141
+ If this is a cachable implementation, pass True here to force
142
+ creating a new instance even if a matching instance exists, and prevent
143
+ storing this instance.
144
+ asynchronous: bool
145
+ loop: asyncio-compatible IOLoop or None
146
+ """
147
+ if self._cached:
148
+ # reusing instance, don't change
149
+ return
150
+ self._cached = True
151
+ self._intrans = False
152
+ self._transaction = None
153
+ self._invalidated_caches_in_transaction = []
154
+ self.dircache = DirCache(**storage_options)
155
+
156
+ if storage_options.pop("add_docs", None):
157
+ warnings.warn("add_docs is no longer supported.", FutureWarning)
158
+
159
+ if storage_options.pop("add_aliases", None):
160
+ warnings.warn("add_aliases has been removed.", FutureWarning)
161
+ # This is set in _Cached
162
+ self._fs_token_ = None
163
+
164
+ @property
165
+ def fsid(self):
166
+ """Persistent filesystem id that can be used to compare filesystems
167
+ across sessions.
168
+ """
169
+ raise NotImplementedError
170
+
171
+ @property
172
+ def _fs_token(self):
173
+ return self._fs_token_
174
+
175
+ def __dask_tokenize__(self):
176
+ return self._fs_token
177
+
178
+ def __hash__(self):
179
+ return int(self._fs_token, 16)
180
+
181
+ def __eq__(self, other):
182
+ return isinstance(other, type(self)) and self._fs_token == other._fs_token
183
+
184
+ def __reduce__(self):
185
+ return make_instance, (type(self), self.storage_args, self.storage_options)
186
+
187
+ @classmethod
188
+ def _strip_protocol(cls, path):
189
+ """Turn path from fully-qualified to file-system-specific
190
+
191
+ May require FS-specific handling, e.g., for relative paths or links.
192
+ """
193
+ if isinstance(path, list):
194
+ return [cls._strip_protocol(p) for p in path]
195
+ path = stringify_path(path)
196
+ protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
197
+ for protocol in protos:
198
+ if path.startswith(protocol + "://"):
199
+ path = path[len(protocol) + 3 :]
200
+ elif path.startswith(protocol + "::"):
201
+ path = path[len(protocol) + 2 :]
202
+ path = path.rstrip("/")
203
+ # use of root_marker to make minimum required path, e.g., "/"
204
+ return path or cls.root_marker
205
+
206
+ def unstrip_protocol(self, name: str) -> str:
207
+ """Format FS-specific path to generic, including protocol"""
208
+ protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
209
+ for protocol in protos:
210
+ if name.startswith(f"{protocol}://"):
211
+ return name
212
+ return f"{protos[0]}://{name}"
213
+
214
+ @staticmethod
215
+ def _get_kwargs_from_urls(path):
216
+ """If kwargs can be encoded in the paths, extract them here
217
+
218
+ This should happen before instantiation of the class; incoming paths
219
+ then should be amended to strip the options in methods.
220
+
221
+ Examples may look like an sftp path "sftp://user@host:/my/path", where
222
+ the user and host should become kwargs and later get stripped.
223
+ """
224
+ # by default, nothing happens
225
+ return {}
226
+
227
+ @classmethod
228
+ def current(cls):
229
+ """Return the most recently instantiated FileSystem
230
+
231
+ If no instance has been created, then create one with defaults
232
+ """
233
+ if cls._latest in cls._cache:
234
+ return cls._cache[cls._latest]
235
+ return cls()
236
+
237
+ @property
238
+ def transaction(self):
239
+ """A context within which files are committed together upon exit
240
+
241
+ Requires the file class to implement `.commit()` and `.discard()`
242
+ for the normal and exception cases.
243
+ """
244
+ if self._transaction is None:
245
+ self._transaction = self.transaction_type(self)
246
+ return self._transaction
247
+
248
+ def start_transaction(self):
249
+ """Begin write transaction for deferring files, non-context version"""
250
+ self._intrans = True
251
+ self._transaction = self.transaction_type(self)
252
+ return self.transaction
253
+
254
+ def end_transaction(self):
255
+ """Finish write transaction, non-context version"""
256
+ self.transaction.complete()
257
+ self._transaction = None
258
+ # The invalid cache must be cleared after the transaction is completed.
259
+ for path in self._invalidated_caches_in_transaction:
260
+ self.invalidate_cache(path)
261
+ self._invalidated_caches_in_transaction.clear()
262
+
263
+ def invalidate_cache(self, path=None):
264
+ """
265
+ Discard any cached directory information
266
+
267
+ Parameters
268
+ ----------
269
+ path: string or None
270
+ If None, clear all listings cached else listings at or under given
271
+ path.
272
+ """
273
+ # Not necessary to implement invalidation mechanism, may have no cache.
274
+ # But if have, you should call this method of parent class from your
275
+ # subclass to ensure expiring caches after transacations correctly.
276
+ # See the implementation of FTPFileSystem in ftp.py
277
+ if self._intrans:
278
+ self._invalidated_caches_in_transaction.append(path)
279
+
280
+ def mkdir(self, path, create_parents=True, **kwargs):
281
+ """
282
+ Create directory entry at path
283
+
284
+ For systems that don't have true directories, may create an for
285
+ this instance only and not touch the real filesystem
286
+
287
+ Parameters
288
+ ----------
289
+ path: str
290
+ location
291
+ create_parents: bool
292
+ if True, this is equivalent to ``makedirs``
293
+ kwargs:
294
+ may be permissions, etc.
295
+ """
296
+ pass # not necessary to implement, may not have directories
297
+
298
+ def makedirs(self, path, exist_ok=False):
299
+ """Recursively make directories
300
+
301
+ Creates directory at path and any intervening required directories.
302
+ Raises exception if, for instance, the path already exists but is a
303
+ file.
304
+
305
+ Parameters
306
+ ----------
307
+ path: str
308
+ leaf directory name
309
+ exist_ok: bool (False)
310
+ If False, will error if the target already exists
311
+ """
312
+ pass # not necessary to implement, may not have directories
313
+
314
+ def rmdir(self, path):
315
+ """Remove a directory, if empty"""
316
+ pass # not necessary to implement, may not have directories
317
+
318
+ def ls(self, path, detail=True, **kwargs):
319
+ """List objects at path.
320
+
321
+ This should include subdirectories and files at that location. The
322
+ difference between a file and a directory must be clear when details
323
+ are requested.
324
+
325
+ The specific keys, or perhaps a FileInfo class, or similar, is TBD,
326
+ but must be consistent across implementations.
327
+ Must include:
328
+
329
+ - full path to the entry (without protocol)
330
+ - size of the entry, in bytes. If the value cannot be determined, will
331
+ be ``None``.
332
+ - type of entry, "file", "directory" or other
333
+
334
+ Additional information
335
+ may be present, appropriate to the file-system, e.g., generation,
336
+ checksum, etc.
337
+
338
+ May use refresh=True|False to allow use of self._ls_from_cache to
339
+ check for a saved listing and avoid calling the backend. This would be
340
+ common where listing may be expensive.
341
+
342
+ Parameters
343
+ ----------
344
+ path: str
345
+ detail: bool
346
+ if True, gives a list of dictionaries, where each is the same as
347
+ the result of ``info(path)``. If False, gives a list of paths
348
+ (str).
349
+ kwargs: may have additional backend-specific options, such as version
350
+ information
351
+
352
+ Returns
353
+ -------
354
+ List of strings if detail is False, or list of directory information
355
+ dicts if detail is True.
356
+ """
357
+ raise NotImplementedError
358
+
359
+ def _ls_from_cache(self, path):
360
+ """Check cache for listing
361
+
362
+ Returns listing, if found (may be empty list for a directly that exists
363
+ but contains nothing), None if not in cache.
364
+ """
365
+ parent = self._parent(path)
366
+ try:
367
+ return self.dircache[path.rstrip("/")]
368
+ except KeyError:
369
+ pass
370
+ try:
371
+ files = [
372
+ f
373
+ for f in self.dircache[parent]
374
+ if f["name"] == path
375
+ or (f["name"] == path.rstrip("/") and f["type"] == "directory")
376
+ ]
377
+ if len(files) == 0:
378
+ # parent dir was listed but did not contain this file
379
+ raise FileNotFoundError(path)
380
+ return files
381
+ except KeyError:
382
+ pass
383
+
384
+ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
385
+ """Return all files belows path
386
+
387
+ List all files, recursing into subdirectories; output is iterator-style,
388
+ like ``os.walk()``. For a simple list of files, ``find()`` is available.
389
+
390
+ When topdown is True, the caller can modify the dirnames list in-place (perhaps
391
+ using del or slice assignment), and walk() will
392
+ only recurse into the subdirectories whose names remain in dirnames;
393
+ this can be used to prune the search, impose a specific order of visiting,
394
+ or even to inform walk() about directories the caller creates or renames before
395
+ it resumes walk() again.
396
+ Modifying dirnames when topdown is False has no effect. (see os.walk)
397
+
398
+ Note that the "files" outputted will include anything that is not
399
+ a directory, such as links.
400
+
401
+ Parameters
402
+ ----------
403
+ path: str
404
+ Root to recurse into
405
+ maxdepth: int
406
+ Maximum recursion depth. None means limitless, but not recommended
407
+ on link-based file-systems.
408
+ topdown: bool (True)
409
+ Whether to walk the directory tree from the top downwards or from
410
+ the bottom upwards.
411
+ on_error: "omit", "raise", a callable
412
+ if omit (default), path with exception will simply be empty;
413
+ If raise, an underlying exception will be raised;
414
+ if callable, it will be called with a single OSError instance as argument
415
+ kwargs: passed to ``ls``
416
+ """
417
+ if maxdepth is not None and maxdepth < 1:
418
+ raise ValueError("maxdepth must be at least 1")
419
+
420
+ path = self._strip_protocol(path)
421
+ full_dirs = {}
422
+ dirs = {}
423
+ files = {}
424
+
425
+ detail = kwargs.pop("detail", False)
426
+ try:
427
+ listing = self.ls(path, detail=True, **kwargs)
428
+ except (FileNotFoundError, OSError) as e:
429
+ if on_error == "raise":
430
+ raise
431
+ if callable(on_error):
432
+ on_error(e)
433
+ return
434
+
435
+ for info in listing:
436
+ # each info name must be at least [path]/part , but here
437
+ # we check also for names like [path]/part/
438
+ pathname = info["name"].rstrip("/")
439
+ name = pathname.rsplit("/", 1)[-1]
440
+ if info["type"] == "directory" and pathname != path:
441
+ # do not include "self" path
442
+ full_dirs[name] = pathname
443
+ dirs[name] = info
444
+ elif pathname == path:
445
+ # file-like with same name as give path
446
+ files[""] = info
447
+ else:
448
+ files[name] = info
449
+
450
+ if not detail:
451
+ dirs = list(dirs)
452
+ files = list(files)
453
+
454
+ if topdown:
455
+ # Yield before recursion if walking top down
456
+ yield path, dirs, files
457
+
458
+ if maxdepth is not None:
459
+ maxdepth -= 1
460
+ if maxdepth < 1:
461
+ if not topdown:
462
+ yield path, dirs, files
463
+ return
464
+
465
+ for d in dirs:
466
+ yield from self.walk(
467
+ full_dirs[d],
468
+ maxdepth=maxdepth,
469
+ detail=detail,
470
+ topdown=topdown,
471
+ **kwargs,
472
+ )
473
+
474
+ if not topdown:
475
+ # Yield after recursion if walking bottom up
476
+ yield path, dirs, files
477
+
478
+ def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
479
+ """List all files below path.
480
+
481
+ Like posix ``find`` command without conditions
482
+
483
+ Parameters
484
+ ----------
485
+ path : str
486
+ maxdepth: int or None
487
+ If not None, the maximum number of levels to descend
488
+ withdirs: bool
489
+ Whether to include directory paths in the output. This is True
490
+ when used by glob, but users usually only want files.
491
+ kwargs are passed to ``ls``.
492
+ """
493
+ # TODO: allow equivalent of -name parameter
494
+ path = self._strip_protocol(path)
495
+ out = {}
496
+
497
+ # Add the root directory if withdirs is requested
498
+ # This is needed for posix glob compliance
499
+ if withdirs and path != "" and self.isdir(path):
500
+ out[path] = self.info(path)
501
+
502
+ for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
503
+ if withdirs:
504
+ files.update(dirs)
505
+ out.update({info["name"]: info for name, info in files.items()})
506
+ if not out and self.isfile(path):
507
+ # walk works on directories, but find should also return [path]
508
+ # when path happens to be a file
509
+ out[path] = {}
510
+ names = sorted(out)
511
+ if not detail:
512
+ return names
513
+ else:
514
+ return {name: out[name] for name in names}
515
+
516
+ def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
517
+ """Space used by files and optionally directories within a path
518
+
519
+ Directory size does not include the size of its contents.
520
+
521
+ Parameters
522
+ ----------
523
+ path: str
524
+ total: bool
525
+ Whether to sum all the file sizes
526
+ maxdepth: int or None
527
+ Maximum number of directory levels to descend, None for unlimited.
528
+ withdirs: bool
529
+ Whether to include directory paths in the output.
530
+ kwargs: passed to ``find``
531
+
532
+ Returns
533
+ -------
534
+ Dict of {path: size} if total=False, or int otherwise, where numbers
535
+ refer to bytes used.
536
+ """
537
+ sizes = {}
538
+ if withdirs and self.isdir(path):
539
+ # Include top-level directory in output
540
+ info = self.info(path)
541
+ sizes[info["name"]] = info["size"]
542
+ for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
543
+ info = self.info(f)
544
+ sizes[info["name"]] = info["size"]
545
+ if total:
546
+ return sum(sizes.values())
547
+ else:
548
+ return sizes
549
+
550
    def glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        If the path ends with '/', only folders are returned.

        We support ``"**"``,
        ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.

        The `maxdepth` option is applied on the first `**` found in the path.

        kwargs are passed to ``ls``.

        Returns a sorted list of matching paths, or a dict of
        ``{path: info}`` when ``detail=True`` is passed in kwargs.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        # remember trailing-slash intent before _strip_protocol removes it
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # position of the first magic character; len(path) means "no such char"
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # plain path: existence check only, no listing walk
            if self.exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # walk from the deepest non-magic directory component
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                # maxdepth constrains the recursive part starting at the first "**"
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                depth = None

        allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                # directories must match with a trailing "/" when the query
                # explicitly targeted directories (trailing sep or "/**")
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)
628
+
629
+ def exists(self, path, **kwargs):
630
+ """Is there a file at the given path"""
631
+ try:
632
+ self.info(path, **kwargs)
633
+ return True
634
+ except: # noqa: E722
635
+ # any exception allowed bar FileNotFoundError?
636
+ return False
637
+
638
+ def lexists(self, path, **kwargs):
639
+ """If there is a file at the given path (including
640
+ broken links)"""
641
+ return self.exists(path)
642
+
643
    def info(self, path, **kwargs):
        """Give details of entry at path

        Returns a single dictionary, with exactly the same information as ``ls``
        would with ``detail=True``.

        The default implementation calls ls and could be overridden by a
        shortcut. kwargs are passed on to ```ls()``.

        Some file systems might not be able to measure the file's size, in
        which case, the returned dict will include ``'size': None``.

        Returns
        -------
        dict with keys: name (full path in the FS), size (in bytes), type (file,
        directory, or something else) and other FS-specific keys.

        Raises
        ------
        FileNotFoundError
            If neither the parent listing nor a listing of ``path`` itself
            yields an entry for ``path``.
        """
        path = self._strip_protocol(path)
        # First attempt: find the entry in its parent's listing (works for
        # both files and directories on most backends).
        out = self.ls(self._parent(path), detail=True, **kwargs)
        out = [o for o in out if o["name"].rstrip("/") == path]
        if out:
            return out[0]
        # Fallback: list the path itself. A single self-entry means a file;
        # any other non-empty listing means a directory.
        out = self.ls(path, detail=True, **kwargs)
        path = path.rstrip("/")
        out1 = [o for o in out if o["name"].rstrip("/") == path]
        if len(out1) == 1:
            if "size" not in out1[0]:
                out1[0]["size"] = None
            return out1[0]
        elif len(out1) > 1 or out:
            # synthesized directory entry; size 0 by convention
            return {"name": path, "size": 0, "type": "directory"}
        else:
            raise FileNotFoundError(path)
676
+
677
+ def checksum(self, path):
678
+ """Unique value for current version of file
679
+
680
+ If the checksum is the same from one moment to another, the contents
681
+ are guaranteed to be the same. If the checksum changes, the contents
682
+ *might* have changed.
683
+
684
+ This should normally be overridden; default will probably capture
685
+ creation/modification timestamp (which would be good) or maybe
686
+ access timestamp (which would be bad)
687
+ """
688
+ return int(tokenize(self.info(path)), 16)
689
+
690
+ def size(self, path):
691
+ """Size in bytes of file"""
692
+ return self.info(path).get("size", None)
693
+
694
+ def sizes(self, paths):
695
+ """Size in bytes of each file in a list of paths"""
696
+ return [self.size(p) for p in paths]
697
+
698
+ def isdir(self, path):
699
+ """Is this entry directory-like?"""
700
+ try:
701
+ return self.info(path)["type"] == "directory"
702
+ except OSError:
703
+ return False
704
+
705
+ def isfile(self, path):
706
+ """Is this entry file-like?"""
707
+ try:
708
+ return self.info(path)["type"] == "file"
709
+ except: # noqa: E722
710
+ return False
711
+
712
+ def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
713
+ """Get the contents of the file as a string.
714
+
715
+ Parameters
716
+ ----------
717
+ path: str
718
+ URL of file on this filesystems
719
+ encoding, errors, newline: same as `open`.
720
+ """
721
+ with self.open(
722
+ path,
723
+ mode="r",
724
+ encoding=encoding,
725
+ errors=errors,
726
+ newline=newline,
727
+ **kwargs,
728
+ ) as f:
729
+ return f.read()
730
+
731
+ def write_text(
732
+ self, path, value, encoding=None, errors=None, newline=None, **kwargs
733
+ ):
734
+ """Write the text to the given file.
735
+
736
+ An existing file will be overwritten.
737
+
738
+ Parameters
739
+ ----------
740
+ path: str
741
+ URL of file on this filesystems
742
+ value: str
743
+ Text to write.
744
+ encoding, errors, newline: same as `open`.
745
+ """
746
+ with self.open(
747
+ path,
748
+ mode="w",
749
+ encoding=encoding,
750
+ errors=errors,
751
+ newline=newline,
752
+ **kwargs,
753
+ ) as f:
754
+ return f.write(value)
755
+
756
    def cat_file(self, path, start=None, end=None, **kwargs):
        """Get the content of a file

        Parameters
        ----------
        path: URL of file on this filesystems
        start, end: int
            Bytes limits of the read. If negative, backwards from end,
            like usual python slices. Either can be None for start or
            end of file, respectively
        kwargs: passed to ``open()``.

        Returns
        -------
        bytes
        """
        # explicitly set buffering off?
        with self.open(path, "rb", **kwargs) as f:
            if start is not None:
                if start >= 0:
                    f.seek(start)
                else:
                    # negative start counts from EOF; relies on the fsspec
                    # file object exposing a ``size`` attribute
                    f.seek(max(0, f.size + start))
            if end is not None:
                if end < 0:
                    end = f.size + end
                # read exactly up to the resolved absolute end offset
                return f.read(end - f.tell())
            return f.read()
780
+
781
+ def pipe_file(self, path, value, mode="overwrite", **kwargs):
782
+ """Set the bytes of given file"""
783
+ if mode == "create" and self.exists(path):
784
+ # non-atomic but simple way; or could use "xb" in open(), which is likely
785
+ # not as well supported
786
+ raise FileExistsError
787
+ with self.open(path, "wb", **kwargs) as f:
788
+ f.write(value)
789
+
790
+ def pipe(self, path, value=None, **kwargs):
791
+ """Put value into path
792
+
793
+ (counterpart to ``cat``)
794
+
795
+ Parameters
796
+ ----------
797
+ path: string or dict(str, bytes)
798
+ If a string, a single remote location to put ``value`` bytes; if a dict,
799
+ a mapping of {path: bytesvalue}.
800
+ value: bytes, optional
801
+ If using a single path, these are the bytes to put there. Ignored if
802
+ ``path`` is a dict
803
+ """
804
+ if isinstance(path, str):
805
+ self.pipe_file(self._strip_protocol(path), value, **kwargs)
806
+ elif isinstance(path, dict):
807
+ for k, v in path.items():
808
+ self.pipe_file(self._strip_protocol(k), v, **kwargs)
809
+ else:
810
+ raise ValueError("path must be str or dict")
811
+
812
+ def cat_ranges(
813
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
814
+ ):
815
+ """Get the contents of byte ranges from one or more files
816
+
817
+ Parameters
818
+ ----------
819
+ paths: list
820
+ A list of of filepaths on this filesystems
821
+ starts, ends: int or list
822
+ Bytes limits of the read. If using a single int, the same value will be
823
+ used to read all the specified files.
824
+ """
825
+ if max_gap is not None:
826
+ raise NotImplementedError
827
+ if not isinstance(paths, list):
828
+ raise TypeError
829
+ if not isinstance(starts, list):
830
+ starts = [starts] * len(paths)
831
+ if not isinstance(ends, list):
832
+ ends = [ends] * len(paths)
833
+ if len(starts) != len(paths) or len(ends) != len(paths):
834
+ raise ValueError
835
+ out = []
836
+ for p, s, e in zip(paths, starts, ends):
837
+ try:
838
+ out.append(self.cat_file(p, s, e))
839
+ except Exception as e:
840
+ if on_error == "return":
841
+ out.append(e)
842
+ else:
843
+ raise
844
+ return out
845
+
846
    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """Fetch (potentially multiple) paths' contents

        Parameters
        ----------
        recursive: bool
            If True, assume the path(s) are directories, and get all the
            contained files
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.
        kwargs: passed to cat_file

        Returns
        -------
        dict of {path: contents} if there are multiple paths
        or the path has been otherwise expanded

        Otherwise (single literal path), the raw bytes of that one file.
        """
        paths = self.expand_path(path, recursive=recursive)
        # dict output whenever the request expanded to more than the single
        # literal path that was asked for
        if (
            len(paths) > 1
            or isinstance(path, list)
            or paths[0] != self._strip_protocol(path)
        ):
            out = {}
            for path in paths:  # NOTE: rebinds the parameter name
                try:
                    out[path] = self.cat_file(path, **kwargs)
                except Exception as e:
                    if on_error == "raise":
                        raise
                    if on_error == "return":
                        out[path] = e
                    # on_error == "omit": fall through, key not included
            return out
        else:
            return self.cat_file(paths[0], **kwargs)
885
+
886
    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
        """Copy single remote file to local

        Parameters
        ----------
        rpath: str
            Remote source path.
        lpath: str or file-like
            Local destination; if file-like, bytes are written directly to it.
        callback: fsspec callback
            Receives the remote file size and per-chunk progress updates.
        outfile: file-like, optional
            Pre-opened destination; only closed here if it was opened here.
        kwargs: passed to ``open()`` on the remote side.
        """
        from .implementations.local import LocalFileSystem

        if isfilelike(lpath):
            outfile = lpath
        elif self.isdir(rpath):
            # remote directory: just mirror the directory locally, no data
            os.makedirs(lpath, exist_ok=True)
            return None

        fs = LocalFileSystem(auto_mkdir=True)
        fs.makedirs(fs._parent(lpath), exist_ok=True)

        with self.open(rpath, "rb", **kwargs) as f1:
            if outfile is None:
                outfile = open(lpath, "wb")

            try:
                callback.set_size(getattr(f1, "size", None))
                # chunked copy; loop ends when read() returns b""
                data = True
                while data:
                    data = f1.read(self.blocksize)
                    segment_len = outfile.write(data)
                    if segment_len is None:
                        # some file-likes return None from write()
                        segment_len = len(data)
                    callback.relative_update(segment_len)
            finally:
                # only close what we opened; caller-provided file-likes stay open
                if not isfilelike(lpath):
                    outfile.close()
915
+
916
    def get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        Calls get_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(rpath, str)
            rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
                if not rpaths:
                    return

            if isinstance(lpath, str):
                lpath = make_path_posix(lpath)

            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            # decides whether destination paths are nested under an existing
            # directory (see other_paths for the exact naming rules)
            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(lpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            # one branched child callback per file transfer
            with callback.branched(rpath, lpath) as child:
                self.get_file(rpath, lpath, callback=child, **kwargs)
977
+
978
    def put_file(
        self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
    ):
        """Copy single file to remote

        Parameters
        ----------
        lpath: str
            Local source path; if it is a directory, only the remote directory
            is created and no data is transferred.
        rpath: str
            Remote destination path.
        callback: fsspec callback
            Receives total size and per-chunk progress.
        mode: "overwrite" or "create"
            With "create", raise FileExistsError if the remote path exists
            (check-then-write, not atomic).
        kwargs: passed to remote ``open()``.
        """
        if mode == "create" and self.exists(rpath):
            raise FileExistsError
        if os.path.isdir(lpath):
            self.makedirs(rpath, exist_ok=True)
            return None

        with open(lpath, "rb") as f1:
            # seek to end to learn the size, then rewind for the copy
            size = f1.seek(0, 2)
            callback.set_size(size)
            f1.seek(0)

            self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
            with self.open(rpath, "wb", **kwargs) as f2:
                while f1.tell() < size:
                    data = f1.read(self.blocksize)
                    segment_len = f2.write(data)
                    if segment_len is None:
                        # some file-likes return None from write()
                        segment_len = len(data)
                    callback.relative_update(segment_len)
1001
+
1002
    def put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        Calls put_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or self.isdir(rpath)
            )

            rpath = (
                self._strip_protocol(rpath)
                if isinstance(rpath, str)
                else [self._strip_protocol(p) for p in rpath]
            )
            # decides whether destination paths are nested under an existing
            # directory (see other_paths for the exact naming rules)
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(rpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            # one branched child callback per file transfer
            with callback.branched(lpath, rpath) as child:
                self.put_file(lpath, rpath, callback=child, **kwargs)
1067
+
1068
+ def head(self, path, size=1024):
1069
+ """Get the first ``size`` bytes from file"""
1070
+ with self.open(path, "rb") as f:
1071
+ return f.read(size)
1072
+
1073
+ def tail(self, path, size=1024):
1074
+ """Get the last ``size`` bytes from file"""
1075
+ with self.open(path, "rb") as f:
1076
+ f.seek(max(-size, -f.size), 2)
1077
+ return f.read()
1078
+
1079
    def cp_file(self, path1, path2, **kwargs):
        """Copy a single file within this filesystem; backends must override."""
        raise NotImplementedError
1081
+
1082
    def copy(
        self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
    ):
        """Copy within two locations in the filesystem

        on_error : "raise", "ignore"
            If raise, any not-found exceptions will be raised; if ignore any
            not-found exceptions will cause the path to be skipped; defaults to
            raise unless recursive is true, where the default is ignore
        """
        # resolve the default error policy based on recursion
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            from .implementations.local import trailing_sep

            source_is_str = isinstance(path1, str)
            paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or self.isdir(path2)
            )

            # decides whether destination paths are nested under an existing
            # directory (see other_paths for the exact naming rules)
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        for p1, p2 in zip(paths1, paths2):
            try:
                self.cp_file(p1, p2, **kwargs)
            except FileNotFoundError:
                # only not-found errors are subject to the on_error policy
                if on_error == "raise":
                    raise
1135
+
1136
    def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
        """Turn one or more globs or directories into a list of all matching paths
        to files or directories.

        kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``

        Returns a sorted list of unique paths; raises FileNotFoundError when
        nothing matches.
        """

        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, (str, os.PathLike)):
            # normalize the scalar case to the list case
            out = self.expand_path([path], recursive, maxdepth)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:
                if has_magic(p):
                    bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            self.expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                                **kwargs,
                            )
                        )
                    continue
                elif recursive:
                    rec = set(
                        self.find(
                            p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                        )
                    )
                    out |= rec
                if p not in out and (recursive is False or self.exists(p)):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
1183
+
1184
+ def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
1185
+ """Move file(s) from one location to another"""
1186
+ if path1 == path2:
1187
+ logger.debug("%s mv: The paths are the same, so no files were moved.", self)
1188
+ else:
1189
+ # explicitly raise exception to prevent data corruption
1190
+ self.copy(
1191
+ path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise"
1192
+ )
1193
+ self.rm(path1, recursive=recursive)
1194
+
1195
    def rm_file(self, path):
        """Delete a file

        Delegates to ``_rm``, the legacy single-file delete hook; backends
        should override one of the two.
        """
        self._rm(path)
1198
+
1199
    def _rm(self, path):
        """Delete one file

        Raises NotImplementedError unless a backend overrides it.
        """
        # this is the old name for the method, prefer rm_file
        raise NotImplementedError
1203
+
1204
+ def rm(self, path, recursive=False, maxdepth=None):
1205
+ """Delete files.
1206
+
1207
+ Parameters
1208
+ ----------
1209
+ path: str or list of str
1210
+ File(s) to delete.
1211
+ recursive: bool
1212
+ If file(s) are directories, recursively delete contents and then
1213
+ also remove the directory
1214
+ maxdepth: int or None
1215
+ Depth to pass to walk for finding files to delete, if recursive.
1216
+ If None, there will be no limit and infinite recursion may be
1217
+ possible.
1218
+ """
1219
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
1220
+ for p in reversed(path):
1221
+ self.rm_file(p)
1222
+
1223
    @classmethod
    def _parent(cls, path):
        """Return the parent directory of ``path``, protocol stripped.

        NOTE(review): ``lstrip(cls.root_marker)`` strips a *set of characters*,
        not a prefix; this is correct for the usual one-character markers like
        "/" but would over-strip a multi-character root_marker — confirm no
        subclass relies on one.
        """
        path = cls._strip_protocol(path)
        if "/" in path:
            parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
            return cls.root_marker + parent
        else:
            # no separator: the parent is the filesystem root
            return cls.root_marker
1231
+
1232
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return raw bytes-mode file-like from the file-system

        Backends normally override this; the default builds an
        ``AbstractBufferedFile`` over this filesystem instance. ``autocommit``
        is False when opened inside a transaction, deferring the final write.
        """
        return AbstractBufferedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
1251
+
1252
    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
            Mode "x" (exclusive write) may be implemented by the backend. Even if
            it is, whether it is checked up front or on commit, and whether it is
            atomic is implementation-dependent.
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            # text mode: open the equivalent binary stream recursively and
            # wrap it in a TextIOWrapper for decoding
            mode = mode.replace("t", "") + "b"

            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            # inside a transaction, writes are deferred (autocommit=False)
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                compression = get_compression(path, compression)
                compress = compr[compression]
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                # register deferred write for commit/discard at transaction end
                self.transaction.files.append(f)
            return f
1329
+
1330
+ def touch(self, path, truncate=True, **kwargs):
1331
+ """Create empty file, or update timestamp
1332
+
1333
+ Parameters
1334
+ ----------
1335
+ path: str
1336
+ file location
1337
+ truncate: bool
1338
+ If True, always set file size to 0; if False, update timestamp and
1339
+ leave file unchanged, if backend allows this
1340
+ """
1341
+ if truncate or not self.exists(path):
1342
+ with self.open(path, "wb", **kwargs):
1343
+ pass
1344
+ else:
1345
+ raise NotImplementedError # update timestamp, if possible
1346
+
1347
+ def ukey(self, path):
1348
+ """Hash of file properties, to tell if it has changed"""
1349
+ return sha256(str(self.info(path)).encode()).hexdigest()
1350
+
1351
    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read. If None, read to end.
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        :func:`fsspec.utils.read_block`
        """
        with self.open(fn, "rb") as f:
            size = f.size
            if length is None:
                length = size
            # clamp the read so it never extends past EOF
            if size is not None and offset + length > size:
                length = size - offset
            # delegates to the module-level fsspec.utils.read_block helper
            # (same name as this method)
            return read_block(f, offset, length, delimiter)
1395
+
1396
    def to_json(self, *, include_password: bool = True) -> str:
        """
        JSON representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        JSON string with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        return json.dumps(
            self,
            # dynamic subclass carries the include_password flag into the
            # encoder, since json.dumps accepts a class, not an instance config
            cls=type(
                "_FilesystemJSONEncoder",
                (FilesystemJSONEncoder,),
                {"include_password": include_password},
            ),
        )
1428
+
1429
    @staticmethod
    def from_json(blob: str) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from JSON representation.

        See ``.to_json()`` for the expected structure of the input.

        Parameters
        ----------
        blob: str
            JSON string as produced by ``to_json()``.

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        return json.loads(blob, cls=FilesystemJSONDecoder)
1453
+
1454
    def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
        """
        JSON-serializable dictionary representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        Dictionary with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        json_encoder = FilesystemJSONEncoder()

        cls = type(self)
        proto = self.protocol

        # work on a copy so the live storage_options are never mutated
        storage_options = dict(self.storage_options)
        if not include_password:
            storage_options.pop("password", None)

        return dict(
            cls=f"{cls.__module__}:{cls.__name__}",
            protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
            args=json_encoder.make_serializable(self.storage_args),
            **json_encoder.make_serializable(storage_options),
        )
1493
+
1494
    @staticmethod
    def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from dictionary representation.

        See ``.to_dict()`` for the expected structure of the input.

        Parameters
        ----------
        dct: Dict[str, Any]
            Mapping as produced by ``to_dict()``.

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Raises
        ------
        ValueError
            If the ``cls`` key cannot be resolved to a filesystem class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        json_decoder = FilesystemJSONDecoder()

        dct = dict(dct)  # Defensive copy

        cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
        if cls is None:
            raise ValueError("Not a serialized AbstractFileSystem")

        # remaining keys become constructor keyword arguments
        dct.pop("cls", None)
        dct.pop("protocol", None)

        return cls(
            *json_decoder.unmake_serializable(dct.pop("args", ())),
            **json_decoder.unmake_serializable(dct),
        )
1532
+
1533
    def _get_pyarrow_filesystem(self):
        """
        Make a version of the FS instance which will be acceptable to pyarrow

        Returns ``self`` unchanged.
        """
        # all instances already also derive from pyarrow
        return self
1539
+
1540
    def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
        """Create key/value store based on this file-system

        Makes a MutableMapping interface to the FS at the given root path.
        See ``fsspec.mapping.FSMap`` for further details.

        Parameters
        ----------
        root: str
            Path within the filesystem acting as the mapping's root.
        check, create: bool
            Forwarded to ``FSMap`` (validate/create the root on construction).
        missing_exceptions: tuple of Exception types, optional
            Exceptions to interpret as a missing key.
        """
        from .mapping import FSMap

        return FSMap(
            root,
            self,
            check=check,
            create=create,
            missing_exceptions=missing_exceptions,
        )
1555
+
1556
    @classmethod
    def clear_instance_cache(cls):
        """
        Clear the cache of filesystem instances.

        Notes
        -----
        Unless overridden by setting the ``cachable`` class attribute to False,
        the filesystem class stores a reference to newly created instances. This
        prevents Python's normal rules around garbage collection from working,
        since the instances refcount will not drop to zero until
        ``clear_instance_cache`` is called.
        """
        # drops every cached instance for this class (and its subclasses
        # sharing the same _cache mapping)
        cls._cache.clear()
1570
+
1571
    def created(self, path):
        """Return the created timestamp of a file as a datetime.datetime

        Raises NotImplementedError unless a backend overrides it.
        """
        raise NotImplementedError
1574
+
1575
    def modified(self, path):
        """Return the modified timestamp of a file as a datetime.datetime

        Raises NotImplementedError unless a backend overrides it.
        """
        raise NotImplementedError
1578
+
1579
    def tree(
        self,
        path: str = "/",
        recursion_limit: int = 2,
        max_display: int = 25,
        display_size: bool = False,
        prefix: str = "",
        is_last: bool = True,
        first: bool = True,
        indent_size: int = 4,
    ) -> str:
        """
        Return a tree-like structure of the filesystem starting from the given path as a string.

        Parameters
        ----------
        path: Root path to start traversal from
        recursion_limit: Maximum depth of directory traversal
        max_display: Maximum number of items to display per directory
        display_size: Whether to display file sizes
        prefix: Current line prefix for visual tree structure
        is_last: Whether current item is last in its level
        first: Whether this is the first call (displays root path)
        indent_size: Number of spaces by indent

        Returns
        -------
        str: A string representing the tree structure.

        Example
        -------
        >>> from fsspec import filesystem

        >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
        >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
        >>> print(tree)
        """

        def format_bytes(n: int) -> str:
            """Format bytes as text."""
            # NOTE: ``prefix`` here shadows the outer ``prefix`` parameter;
            # the loop variable is only the unit letter (P/T/G/M/k).
            for prefix, k in (
                ("P", 2**50),
                ("T", 2**40),
                ("G", 2**30),
                ("M", 2**20),
                ("k", 2**10),
            ):
                if n >= 0.9 * k:
                    return f"{n / k:.2f} {prefix}b"
            return f"{n}B"

        result = []

        # only the outermost call prints the root path itself
        if first:
            result.append(path)

        if recursion_limit:
            indent = " " * indent_size
            contents = self.ls(path, detail=True)
            # directories first, then alphabetical by name
            contents.sort(
                key=lambda x: (x.get("type") != "directory", x.get("name", ""))
            )

            # truncate long listings to max_display entries per directory
            if max_display is not None and len(contents) > max_display:
                displayed_contents = contents[:max_display]
                remaining_count = len(contents) - max_display
            else:
                displayed_contents = contents
                remaining_count = 0

            for i, item in enumerate(displayed_contents):
                # the "└" corner is only used when nothing (including the
                # "...more items" row) follows at this level
                is_last_item = (i == len(displayed_contents) - 1) and (
                    remaining_count == 0
                )

                branch = (
                    "└" + ("─" * (indent_size - 2))
                    if is_last_item
                    else "├" + ("─" * (indent_size - 2))
                )
                branch += " "
                new_prefix = prefix + (
                    indent if is_last_item else "│" + " " * (indent_size - 1)
                )

                name = os.path.basename(item.get("name", ""))

                if display_size and item.get("type") == "directory":
                    # extra ls() per directory to summarise its contents
                    sub_contents = self.ls(item.get("name", ""), detail=True)
                    num_files = sum(
                        1 for sub_item in sub_contents if sub_item.get("type") == "file"
                    )
                    num_folders = sum(
                        1
                        for sub_item in sub_contents
                        if sub_item.get("type") == "directory"
                    )

                    if num_files == 0 and num_folders == 0:
                        size = " (empty folder)"
                    elif num_files == 0:
                        size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                    elif num_folders == 0:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                    else:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
                elif display_size and item.get("type") == "file":
                    size = f" ({format_bytes(item.get('size', 0))})"
                else:
                    size = ""

                result.append(f"{prefix}{branch}{name}{size}")

                # recurse one level shallower into subdirectories
                if item.get("type") == "directory" and recursion_limit > 0:
                    result.append(
                        self.tree(
                            path=item.get("name", ""),
                            recursion_limit=recursion_limit - 1,
                            max_display=max_display,
                            display_size=display_size,
                            prefix=new_prefix,
                            is_last=is_last_item,
                            first=False,
                            indent_size=indent_size,
                        )
                    )

            if remaining_count > 0:
                more_message = f"{remaining_count} more item(s) not displayed."
                result.append(
                    f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
                )

        # drop empty strings produced by exhausted recursion branches
        return "\n".join(_ for _ in result if _)
1713
+
1714
    # ------------------------------------------------------------------------
    # Aliases
    # (thin delegating wrappers kept for API compatibility; each forwards
    # directly to the canonical method named in its docstring)

    def read_bytes(self, path, start=None, end=None, **kwargs):
        """Alias of `AbstractFileSystem.cat_file`."""
        return self.cat_file(path, start=start, end=end, **kwargs)

    def write_bytes(self, path, value, **kwargs):
        """Alias of `AbstractFileSystem.pipe_file`."""
        self.pipe_file(path, value, **kwargs)

    def makedir(self, path, create_parents=True, **kwargs):
        """Alias of `AbstractFileSystem.mkdir`."""
        return self.mkdir(path, create_parents=create_parents, **kwargs)

    def mkdirs(self, path, exist_ok=False):
        """Alias of `AbstractFileSystem.makedirs`."""
        return self.makedirs(path, exist_ok=exist_ok)

    def listdir(self, path, detail=True, **kwargs):
        """Alias of `AbstractFileSystem.ls`."""
        return self.ls(path, detail=detail, **kwargs)

    def cp(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.copy`."""
        return self.copy(path1, path2, **kwargs)

    def move(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def stat(self, path, **kwargs):
        """Alias of `AbstractFileSystem.info`."""
        return self.info(path, **kwargs)

    def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
        """Alias of `AbstractFileSystem.du`."""
        return self.du(path, total=total, maxdepth=maxdepth, **kwargs)

    def rename(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def delete(self, path, recursive=False, maxdepth=None):
        """Alias of `AbstractFileSystem.rm`."""
        return self.rm(path, recursive=recursive, maxdepth=maxdepth)

    def upload(self, lpath, rpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.put`."""
        return self.put(lpath, rpath, recursive=recursive, **kwargs)

    def download(self, rpath, lpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.get`."""
        return self.get(rpath, lpath, recursive=recursive, **kwargs)
1768
+
1769
+ def sign(self, path, expiration=100, **kwargs):
1770
+ """Create a signed URL representing the given path
1771
+
1772
+ Some implementations allow temporary URLs to be generated, as a
1773
+ way of delegating credentials.
1774
+
1775
+ Parameters
1776
+ ----------
1777
+ path : str
1778
+ The path on the filesystem
1779
+ expiration : int
1780
+ Number of seconds to enable the URL for (if supported)
1781
+
1782
+ Returns
1783
+ -------
1784
+ URL : str
1785
+ The signed URL
1786
+
1787
+ Raises
1788
+ ------
1789
+ NotImplementedError : if method is not implemented for a filesystem
1790
+ """
1791
+ raise NotImplementedError("Sign is not implemented for this filesystem")
1792
+
1793
+ def _isfilestore(self):
1794
+ # Originally inherited from pyarrow DaskFileSystem. Keeping this
1795
+ # here for backwards compatibility as long as pyarrow uses its
1796
+ # legacy fsspec-compatible filesystems and thus accepts fsspec
1797
+ # filesystems as well
1798
+ return False
1799
+
1800
+
1801
class AbstractBufferedFile(io.IOBase):
    """Convenient class to derive from to provide buffering

    In the case that the backend does not provide a pythonic file-like object
    already, this class contains much of the logic to build one. The only
    methods that need to be overridden are ``_upload_chunk``,
    ``_initiate_upload`` and ``_fetch_range``.
    """

    # 5 MiB default buffer for reads and write chunks
    DEFAULT_BLOCK_SIZE = 5 * 2**20
    # lazily-populated result of ``fs.info(path)``; see the ``details`` property
    _details = None

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """
        Template for files with buffered reading and writing

        Parameters
        ----------
        fs: instance of FileSystem
        path: str
            location in file-system
        mode: str
            Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
            systems may be read-only, and some may not support append.
        block_size: int
            Buffer size for reading or writing, 'default' for class default
        autocommit: bool
            Whether to write to final destination; may only impact what
            happens when file is being closed.
        cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
            Caching policy in read mode. See the definitions in ``core``.
        cache_options : dict
            Additional options passed to the constructor for the cache specified
            by `cache_type`.
        size: int
            If given and in read mode, suppressed having to look up the file size
        kwargs:
            Gets stored as self.kwargs
        """
        from .core import caches

        self.path = path
        self.fs = fs
        self.mode = mode
        self.blocksize = (
            self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
        )
        self.loc = 0
        self.autocommit = autocommit
        self.end = None
        self.start = None
        self.closed = False

        if cache_options is None:
            cache_options = {}

        if "trim" in kwargs:
            # legacy top-level kwarg; now belongs inside cache_options
            warnings.warn(
                "Passing 'trim' to control the cache behavior has been deprecated. "
                "Specify it within the 'cache_options' argument instead.",
                FutureWarning,
            )
            cache_options["trim"] = kwargs.pop("trim")

        self.kwargs = kwargs

        if mode not in {"ab", "rb", "wb", "xb"}:
            raise NotImplementedError("File mode not supported")
        if mode == "rb":
            if size is not None:
                self.size = size
            else:
                self.size = self.details["size"]
            self.cache = caches[cache_type](
                self.blocksize, self._fetch_range, self.size, **cache_options
            )
        else:
            # write modes buffer into memory until a block is full
            self.buffer = io.BytesIO()
            self.offset = None
            self.forced = False
            self.location = None

    @property
    def details(self):
        # fetch and memoize the info dict on first access
        if self._details is None:
            self._details = self.fs.info(self.path)
        return self._details

    @details.setter
    def details(self, value):
        self._details = value
        self.size = value["size"]

    @property
    def full_name(self):
        # path including the filesystem's protocol prefix
        return _unstrip_protocol(self.path, self.fs)

    @property
    def closed(self):
        # get around this attr being read-only in IOBase
        # use getattr here, since this can be called during del
        return getattr(self, "_closed", True)

    @closed.setter
    def closed(self, c):
        self._closed = c

    def __hash__(self):
        if "w" in self.mode:
            # write-mode files have no stable content yet; hash by identity
            return id(self)
        else:
            return int(tokenize(self.details), 16)

    def __eq__(self, other):
        """Files are equal if they have the same checksum, only in read mode"""
        if self is other:
            return True
        return (
            isinstance(other, type(self))
            and self.mode == "rb"
            and other.mode == "rb"
            and hash(self) == hash(other)
        )

    def commit(self):
        """Move from temp to final destination"""

    def discard(self):
        """Throw away temporary file"""

    def info(self):
        """File information about this path"""
        if self.readable():
            return self.details
        else:
            raise ValueError("Info not available while writing")

    def tell(self):
        """Current file location"""
        return self.loc

    def seek(self, loc, whence=0):
        """Set current file location

        Parameters
        ----------
        loc: int
            byte location
        whence: {0, 1, 2}
            from start of file, current location or end of file, resp.
        """
        loc = int(loc)
        if not self.mode == "rb":
            # write-side files behave like pipes: no seeking
            raise OSError(ESPIPE, "Seek only available in read mode")
        if whence == 0:
            nloc = loc
        elif whence == 1:
            nloc = self.loc + loc
        elif whence == 2:
            nloc = self.size + loc
        else:
            raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
        if nloc < 0:
            raise ValueError("Seek before start of file")
        self.loc = nloc
        return self.loc

    def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if not self.writable():
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            self.flush()
        return out

    def flush(self, force=False):
        """
        Write buffered data to backend store.

        Writes the current buffer, if it is larger than the block-size, or if
        the file is being closed.

        Parameters
        ----------
        force: bool
            When closing, write the last block even if it is smaller than
            blocks are allowed to be. Disallows further writing to this file.
        """

        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.readable():
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                self._initiate_upload()
            except:
                # mark the file closed on any failure before re-raising
                self.closed = True
                raise

        # a subclass returning False signals "keep buffering"
        if self._upload_chunk(final=force) is not False:
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # may not yet have been initialized, may need to call _initialize_upload

    def _initiate_upload(self):
        """Create remote file/upload"""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.cat_file(self.path, start=start, end=end)

    def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = self.cache._fetch(self.loc, self.loc + length)

        logger.debug(
            "%s read: %i - %i %s",
            self,
            self.loc,
            self.loc + length,
            self.cache._log_stats(),
        )
        self.loc += len(out)
        return out

    def readinto(self, b):
        """mirrors builtin file's readinto method

        https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
        """
        out = memoryview(b).cast("B")
        data = self.read(out.nbytes)
        out[: len(data)] = data
        return len(data)

    def readuntil(self, char=b"\n", blocks=None):
        """Return data between current position and first occurrence of char

        char is included in the output, except if the end of the tile is
        encountered first.

        Parameters
        ----------
        char: bytes
            Thing to find
        blocks: None or int
            How much to read in each go. Defaults to file blocksize - which may
            mean a new read on every call.
        """
        out = []
        while True:
            start = self.tell()
            part = self.read(blocks or self.blocksize)
            if len(part) == 0:
                break
            found = part.find(char)
            if found > -1:
                # rewind to just after the delimiter for the next call
                out.append(part[: found + len(char)])
                self.seek(start + found + len(char))
                break
            out.append(part)
        return b"".join(out)

    def readline(self):
        """Read until first occurrence of newline character

        Note that, because of character encoding, this is not necessarily a
        true line ending.
        """
        return self.readuntil(b"\n")

    def __next__(self):
        out = self.readline()
        if out:
            return out
        raise StopIteration

    def __iter__(self):
        return self

    def readlines(self):
        """Return all data, split by the newline character"""
        data = self.read()
        lines = data.split(b"\n")
        out = [l + b"\n" for l in lines[:-1]]
        if data.endswith(b"\n"):
            return out
        else:
            return out + [lines[-1]]
        # return list(self) ???

    def readinto1(self, b):
        return self.readinto(b)

    def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        try:
            if self.mode == "rb":
                self.cache = None
            else:
                if not self.forced:
                    self.flush(force=True)

                if self.fs is not None:
                    # listing caches may now be stale for this path
                    self.fs.invalidate_cache(self.path)
                    self.fs.invalidate_cache(self.fs._parent(self.path))
        finally:
            self.closed = True

    def readable(self):
        """Whether opened for reading"""
        return "r" in self.mode and not self.closed

    def seekable(self):
        """Whether is seekable (only in read mode)"""
        return self.readable()

    def writable(self):
        """Whether opened for writing"""
        return self.mode in {"wb", "ab", "xb"} and not self.closed

    def __reduce__(self):
        if self.mode != "rb":
            raise RuntimeError("Pickling a writeable file is not supported")

        # unpickles via the module-level ``reopen`` helper
        return reopen, (
            self.fs,
            self.path,
            self.mode,
            self.blocksize,
            self.loc,
            self.size,
            self.autocommit,
            self.cache.name if self.cache else "none",
            self.kwargs,
        )

    def __del__(self):
        if not self.closed:
            self.close()

    def __str__(self):
        return f"<File-like object {type(self.fs).__name__}, {self.path}>"

    __repr__ = __str__

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
2228
+
2229
+
2230
def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
    """Re-open a buffered file and restore its read position.

    Used as the reconstruction callable when unpickling a read-mode
    buffered file: opens the path on ``fs`` with the recorded options and
    seeks back to ``loc``.
    """
    restored = fs.open(
        path,
        mode=mode,
        block_size=blocksize,
        autocommit=autocommit,
        cache_type=cache_type,
        size=size,
        **kwargs,
    )
    if loc > 0:
        restored.seek(loc)
    return restored
meow/lib/python3.13/site-packages/fsspec/transaction.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
+
4
class Transaction:
    """Filesystem transaction write context

    Collects files for deferred commit or discard, so that several write
    operations can be finalized semi-atomically. Works by installing this
    instance as the ``.transaction`` attribute of the given filesystem.
    """

    def __init__(self, fs, **kwargs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        self.files = deque()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        # an exception inside the block means every pending file is discarded
        self.complete(commit=exc_type is None)
        if self.fs:
            self.fs._intrans = False
            self.fs._transaction = None
            self.fs = None

    def start(self):
        """Start a transaction on this FileSystem"""
        # a fresh deque also clears leftovers from a failed completion
        self.files = deque()
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        finalizer = "commit" if commit else "discard"
        while self.files:
            getattr(self.files.popleft(), finalizer)()
        self.fs._intrans = False
        self.fs._transaction = None
        self.fs = None
50
+
51
+
52
class FileActor:
    """Container that commits or discards a batch of file objects as one unit."""

    def __init__(self):
        self.files = []

    def _drain(self, method_name):
        # Invoke the named finalizer on every pending file, then forget them.
        for pending in self.files:
            getattr(pending, method_name)()
        self.files.clear()

    def commit(self):
        self._drain("commit")

    def discard(self):
        self._drain("discard")

    def append(self, f):
        self.files.append(f)
+
69
+
70
class DaskTransaction(Transaction):
    """Transaction whose deferred files are held in a remote FileActor.

    The file collection lives as a ``distributed`` actor, so files opened
    on workers can be registered and finalized from the client.
    """

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        import distributed

        super().__init__(fs)
        client = distributed.default_client()
        # replace the local deque with a remote actor holding the files
        self.files = client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        if commit:
            # block until the remote commit has finished
            self.files.commit().result()
        else:
            self.files.discard().result()
        self.fs._intrans = False
        # NOTE(review): unlike Transaction.complete, fs._transaction is not
        # reset here — confirm whether that asymmetry is intentional
        self.fs = None
meow/lib/python3.13/site-packages/fsspec/utils.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import logging
5
+ import math
6
+ import os
7
+ import re
8
+ import sys
9
+ import tempfile
10
+ from functools import partial
11
+ from hashlib import md5
12
+ from importlib.metadata import version
13
+ from typing import (
14
+ IO,
15
+ TYPE_CHECKING,
16
+ Any,
17
+ Callable,
18
+ Iterable,
19
+ Iterator,
20
+ Sequence,
21
+ TypeVar,
22
+ )
23
+ from urllib.parse import urlsplit
24
+
25
+ if TYPE_CHECKING:
26
+ import pathlib
27
+
28
+ from typing_extensions import TypeGuard
29
+
30
+ from fsspec.spec import AbstractFileSystem
31
+
32
+
33
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
34
+
35
+ T = TypeVar("T")
36
+
37
+
38
+ def infer_storage_options(
39
+ urlpath: str, inherit_storage_options: dict[str, Any] | None = None
40
+ ) -> dict[str, Any]:
41
+ """Infer storage options from URL path and merge it with existing storage
42
+ options.
43
+
44
+ Parameters
45
+ ----------
46
+ urlpath: str or unicode
47
+ Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
48
+ inherit_storage_options: dict (optional)
49
+ Its contents will get merged with the inferred information from the
50
+ given path
51
+
52
+ Returns
53
+ -------
54
+ Storage options dict.
55
+
56
+ Examples
57
+ --------
58
+ >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
59
+ {"protocol": "file", "path", "/mnt/datasets/test.csv"}
60
+ >>> infer_storage_options(
61
+ ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
62
+ ... inherit_storage_options={'extra': 'value'},
63
+ ... ) # doctest: +SKIP
64
+ {"protocol": "hdfs", "username": "username", "password": "pwd",
65
+ "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
66
+ "url_query": "q=1", "extra": "value"}
67
+ """
68
+ # Handle Windows paths including disk name in this special case
69
+ if (
70
+ re.match(r"^[a-zA-Z]:[\\/]", urlpath)
71
+ or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
72
+ ):
73
+ return {"protocol": "file", "path": urlpath}
74
+
75
+ parsed_path = urlsplit(urlpath)
76
+ protocol = parsed_path.scheme or "file"
77
+ if parsed_path.fragment:
78
+ path = "#".join([parsed_path.path, parsed_path.fragment])
79
+ else:
80
+ path = parsed_path.path
81
+ if protocol == "file":
82
+ # Special case parsing file protocol URL on Windows according to:
83
+ # https://msdn.microsoft.com/en-us/library/jj710207.aspx
84
+ windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
85
+ if windows_path:
86
+ drive, path = windows_path.groups()
87
+ path = f"{drive}:{path}"
88
+
89
+ if protocol in ["http", "https"]:
90
+ # for HTTP, we don't want to parse, as requests will anyway
91
+ return {"protocol": protocol, "path": urlpath}
92
+
93
+ options: dict[str, Any] = {"protocol": protocol, "path": path}
94
+
95
+ if parsed_path.netloc:
96
+ # Parse `hostname` from netloc manually because `parsed_path.hostname`
97
+ # lowercases the hostname which is not always desirable (e.g. in S3):
98
+ # https://github.com/dask/dask/issues/1417
99
+ options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
100
+
101
+ if protocol in ("s3", "s3a", "gcs", "gs"):
102
+ options["path"] = options["host"] + options["path"]
103
+ else:
104
+ options["host"] = options["host"]
105
+ if parsed_path.port:
106
+ options["port"] = parsed_path.port
107
+ if parsed_path.username:
108
+ options["username"] = parsed_path.username
109
+ if parsed_path.password:
110
+ options["password"] = parsed_path.password
111
+
112
+ if parsed_path.query:
113
+ options["url_query"] = parsed_path.query
114
+ if parsed_path.fragment:
115
+ options["url_fragment"] = parsed_path.fragment
116
+
117
+ if inherit_storage_options:
118
+ update_storage_options(options, inherit_storage_options)
119
+
120
+ return options
121
+
122
+
123
+ def update_storage_options(
124
+ options: dict[str, Any], inherited: dict[str, Any] | None = None
125
+ ) -> None:
126
+ if not inherited:
127
+ inherited = {}
128
+ collisions = set(options) & set(inherited)
129
+ if collisions:
130
+ for collision in collisions:
131
+ if options.get(collision) != inherited.get(collision):
132
+ raise KeyError(
133
+ f"Collision between inferred and specified storage "
134
+ f"option:\n{collision}"
135
+ )
136
+ options.update(inherited)
137
+
138
+
139
+ # Compression extensions registered via fsspec.compression.register_compression
140
+ compressions: dict[str, str] = {}
141
+
142
+
143
+ def infer_compression(filename: str) -> str | None:
144
+ """Infer compression, if available, from filename.
145
+
146
+ Infer a named compression type, if registered and available, from filename
147
+ extension. This includes builtin (gz, bz2, zip) compressions, as well as
148
+ optional compressions. See fsspec.compression.register_compression.
149
+ """
150
+ extension = os.path.splitext(filename)[-1].strip(".").lower()
151
+ if extension in compressions:
152
+ return compressions[extension]
153
+ return None
154
+
155
+
156
def build_name_function(max_int: float) -> Callable[[int], str]:
    """Returns a function that receives a single integer
    and returns it as a string padded by enough zero characters
    to align with maximum possible integer

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    # The tiny epsilon handles the corner cases where max_int is 0 or an
    # exact power of ten (log10 would otherwise land on an integer).
    width = int(math.ceil(math.log10(max_int + 1e-8)))

    def name_function(i: int) -> str:
        return str(i).zfill(width)

    return name_function
183
+
184
+
185
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.


    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.

    """

    if file.tell() == 0:
        # already at the beginning of the file: that is itself a valid split
        return False

    # ``tail`` carries the trailing bytes of the previous read so that a
    # delimiter spanning two reads is still detected.
    tail: bytes | None = None
    while True:
        chunk = file.read(blocksize)
        if not chunk:
            # end-of-file without delimiter
            return False
        window = tail + chunk if tail else chunk
        try:
            if delimiter in window:
                hit = window.index(delimiter)
                # position just past the delimiter occurrence
                file.seek(file.tell() - (len(window) - hit) + len(delimiter))
                return True
            elif len(chunk) < blocksize:
                # short read means end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # text-mode file objects may reject bytes operations; keep going
            pass
        tail = window[-len(delimiter) :]
232
+
233
+
234
def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        # snap the start of the read forward to the next delimiter boundary
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        # shrink length by however far the start position moved forward
        length -= start - offset

        # snap the end of the read forward to a delimiter boundary too
        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b
306
+
307
+
308
def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    # Fold keyword arguments into the positional tuple so they contribute
    # to the hash as well.
    if kwargs:
        args = args + (kwargs,)
    payload = str(args).encode()
    try:
        digest = md5(payload)
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        digest = md5(payload, usedforsecurity=False)
    return digest.hexdigest()
327
+
328
+
329
def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath: object to be converted

    Returns
    -------
    filepath_str: maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol are coerced according to its
    __fspath__ method.

    For backwards compatibility with older Python version, pathlib.Path
    objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    if isinstance(filepath, str):
        return filepath
    # fspath protocol (os.PathLike, including pathlib.Path)
    fspath = getattr(filepath, "__fspath__", None)
    if fspath is not None:
        return fspath()
    # some filesystem objects expose a plain ``.path`` attribute
    if hasattr(filepath, "path"):
        return filepath.path
    return filepath  # type: ignore[return-value]
359
+
360
+
361
def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    """Instantiate ``cls(*args, **kwargs)`` and call its ``_determine_worker``.

    Module-level helper so the construction can be dispatched as a single
    callable with pre-packed arguments.
    """
    obj = cls(*args, **kwargs)
    obj._determine_worker()  # type: ignore[attr-defined]
    return obj
367
+
368
+
369
def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all

    The prefix is computed on whole ``/``-separated components, not on raw
    characters, and is returned re-joined with ``/``.
    """
    parts = [p.split("/") for p in paths]
    shortest = min(len(p) for p in parts)
    matched = 0
    # Count leading components shared by every path.
    while matched < shortest and all(p[matched] == parts[0][matched] for p in parts):
        matched += 1
    return "/".join(parts[0][:matched])
380
+
381
+
382
def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, if it already exists (and is a dir), files should
        end up inside.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if isinstance(path2, str):
        path2 = path2.rstrip("/")

        if flatten:
            # Drop all directory structure: each output is path2 plus the
            # input's final path component.
            path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
        else:
            cp = common_prefix(paths)
            if exists:
                # Destination dir already exists: keep the last component of
                # the common prefix so inputs land *inside* path2.
                cp = cp.rsplit("/", 1)[0]
            if not cp and all(not s.startswith("/") for s in paths):
                # No common prefix and all inputs are relative: join each
                # path directly under path2.
                path2 = ["/".join([path2, p]) for p in paths]
            else:
                # Substitute the common prefix with the destination root
                # (first occurrence only).
                path2 = [p.replace(cp, path2, 1) for p in paths]
    else:
        # Explicit destination list: must pair up one-to-one with inputs.
        assert len(paths) == len(path2)
    return path2
425
+
426
+
427
def is_exception(obj: Any) -> bool:
    """Return True if ``obj`` is an exception instance.

    Checks against ``BaseException`` so that instances of
    ``KeyboardInterrupt``/``SystemExit`` are included too.
    """
    return isinstance(obj, BaseException)
429
+
430
+
431
+ def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
432
+ return all(hasattr(f, attr) for attr in ["read", "close", "tell"])
433
+
434
+
435
def get_protocol(url: str) -> str:
    """Return the protocol part of ``url``.

    The protocol is whatever precedes the first ``://`` or ``::``
    separator; if neither is present, ``"file"`` is returned.
    """
    url = stringify_path(url)
    pieces = re.split(r"(\:\:|\://)", url, maxsplit=1)
    if len(pieces) == 1:
        # no separator found: assume a local path
        return "file"
    return pieces[0]
441
+
442
+
443
def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        # A filesystem class advertises local-path capability via its
        # ``local_file`` attribute; absence means False.
        return getattr(get_filesystem_class(get_protocol(path)), "local_file", False)
    except (ValueError, ImportError):
        # not in registry or import failed
        return False
452
+
453
+
454
def get_package_version_without_import(name: str) -> str | None:
    """For given package name, try to find the version without importing it

    Import and package.__version__ is still the backup here, so an import
    *might* happen.

    Returns either the version string, or None if the package
    or the version was not readily found.
    """
    # Cheapest path: the module is already imported and carries a version.
    mod = sys.modules.get(name)
    if mod is not None and hasattr(mod, "__version__"):
        return mod.__version__
    # Next: installed-distribution metadata, no import required.
    try:
        return version(name)
    except:  # noqa: E722
        pass
    # Last resort: actually import the package and read __version__.
    try:
        import importlib

        return importlib.import_module(name).__version__
    except (ImportError, AttributeError):
        return None
478
+
479
+
480
def setup_logging(
    logger: logging.Logger | None = None,
    logger_name: str | None = None,
    level: str = "DEBUG",
    clear: bool = True,
) -> logging.Logger:
    """Attach a formatted stream handler to a logger and set its level.

    Exactly one of ``logger`` (an existing logger object) or
    ``logger_name`` (to be looked up via ``logging.getLogger``) must be
    provided. With ``clear=True`` any handlers already attached are
    removed first, so repeated calls do not duplicate output.

    Raises
    ------
    ValueError
        If neither ``logger`` nor ``logger_name`` is given.
    """
    if logger is None and logger_name is None:
        raise ValueError("Provide either logger object or logger name")
    if logger is None:
        logger = logging.getLogger(logger_name)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
        )
    )
    if clear:
        logger.handlers.clear()
    logger.addHandler(stream_handler)
    logger.setLevel(level)
    return logger
499
+
500
+
501
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    """Module-level wrapper around ``fs.unstrip_protocol(name)``."""
    return fs.unstrip_protocol(name)
503
+
504
+
505
def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Mirror attributes and methods from the given
    origin_name attribute of the instance to the
    decorated class.

    Each name in ``methods`` becomes a read-only property on the class
    that forwards lookup to ``getattr(self, origin_name)``.
    """

    def _proxy(attr: str, self: Any) -> Any:
        # Resolve the origin object lazily on every access.
        return getattr(getattr(self, origin_name), attr)

    def decorator(cls: type[T]) -> type[T]:
        for attr in methods:
            # partial binds the attribute name; ``self`` arrives at call time.
            setattr(cls, attr, property(partial(_proxy, attr)))
        return cls

    return decorator
523
+
524
+
525
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    """Context manager that yields ``obj`` unchanged and does nothing on exit."""
    yield obj
528
+
529
+
530
+ def merge_offset_ranges(
531
+ paths: list[str],
532
+ starts: list[int] | int,
533
+ ends: list[int] | int,
534
+ max_gap: int = 0,
535
+ max_block: int | None = None,
536
+ sort: bool = True,
537
+ ) -> tuple[list[str], list[int], list[int]]:
538
+ """Merge adjacent byte-offset ranges when the inter-range
539
+ gap is <= `max_gap`, and when the merged byte range does not
540
+ exceed `max_block` (if specified). By default, this function
541
+ will re-order the input paths and byte ranges to ensure sorted
542
+ order. If the user can guarantee that the inputs are already
543
+ sorted, passing `sort=False` will skip the re-ordering.
544
+ """
545
+ # Check input
546
+ if not isinstance(paths, list):
547
+ raise TypeError
548
+ if not isinstance(starts, list):
549
+ starts = [starts] * len(paths)
550
+ if not isinstance(ends, list):
551
+ ends = [ends] * len(paths)
552
+ if len(starts) != len(paths) or len(ends) != len(paths):
553
+ raise ValueError
554
+
555
+ # Early Return
556
+ if len(starts) <= 1:
557
+ return paths, starts, ends
558
+
559
+ starts = [s or 0 for s in starts]
560
+ # Sort by paths and then ranges if `sort=True`
561
+ if sort:
562
+ paths, starts, ends = (
563
+ list(v)
564
+ for v in zip(
565
+ *sorted(
566
+ zip(paths, starts, ends),
567
+ )
568
+ )
569
+ )
570
+
571
+ if paths:
572
+ # Loop through the coupled `paths`, `starts`, and
573
+ # `ends`, and merge adjacent blocks when appropriate
574
+ new_paths = paths[:1]
575
+ new_starts = starts[:1]
576
+ new_ends = ends[:1]
577
+ for i in range(1, len(paths)):
578
+ if paths[i] == paths[i - 1] and new_ends[-1] is None:
579
+ continue
580
+ elif (
581
+ paths[i] != paths[i - 1]
582
+ or ((starts[i] - new_ends[-1]) > max_gap)
583
+ or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
584
+ ):
585
+ # Cannot merge with previous block.
586
+ # Add new `paths`, `starts`, and `ends` elements
587
+ new_paths.append(paths[i])
588
+ new_starts.append(starts[i])
589
+ new_ends.append(ends[i])
590
+ else:
591
+ # Merge with previous block by updating the
592
+ # last element of `ends`
593
+ new_ends[-1] = ends[i]
594
+ return new_paths, new_starts, new_ends
595
+
596
+ # `paths` is empty. Just return input lists
597
+ return paths, starts, ends
598
+
599
+
600
def file_size(filelike: IO[bytes]) -> int:
    """Find length of any open read-mode file-like.

    Seeks to the end of the stream to obtain the size, then restores the
    original position before returning.
    """
    saved_pos = filelike.tell()
    try:
        end = filelike.seek(0, 2)
    finally:
        filelike.seek(saved_pos)
    return end
607
+
608
+
609
@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
    """
    A context manager that opens a temporary file next to `path` and, on exit,
    replaces `path` with the temporary file, thereby updating `path`
    atomically.
    """
    # Create the temp file in the same directory so os.replace is atomic
    # (same filesystem).
    fd, tmp_name = tempfile.mkstemp(
        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
    )
    try:
        with open(fd, mode) as stream:
            yield stream
    except BaseException:
        # On any failure, best-effort removal of the temp file, then re-raise.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(tmp_name)
        raise
    os.replace(tmp_name, path)
628
+
629
+
630
def _translate(pat, STAR, QUESTION_MARK):
    """Translate one shell-pattern segment into a list of regex fragments.

    ``STAR`` and ``QUESTION_MARK`` are the regex snippets substituted for
    ``*`` and ``?``; character classes (``[...]``) are converted in place and
    all other characters are escaped literally.
    """
    # Copied from: https://github.com/python/cpython/pull/106703.
    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            # Scan forward for the matching "]" of the character class;
            # a leading "!" (negation) or "]" (literal) is skipped first.
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                # No closing "]": treat "[" as a literal character.
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    # Split the class body into chunks around range hyphens.
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            add(re.escape(c))
    assert i == n
    return res
702
+
703
+
704
def glob_translate(pat):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True, seps=None
    """Translate a pathname with shell wildcards to a regular expression."""
    # Accept both separators on platforms that define an altsep (Windows).
    if os.path.altsep:
        seps = os.path.sep + os.path.altsep
    else:
        seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    # Building blocks: "*" matches one segment, "**" matches any number.
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == "*":
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            # "**" mixed with other characters inside one component is rejected.
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    # (?s:...) makes "." match newlines; \Z anchors at end of string.
    return rf"(?s:{res})\Z"
meow/lib/python3.13/site-packages/huggingface_hub-0.27.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
meow/lib/python3.13/site-packages/huggingface_hub-0.27.0.dist-info/METADATA ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: huggingface-hub
3
+ Version: 0.27.0
4
+ Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
5
+ Home-page: https://github.com/huggingface/huggingface_hub
6
+ Author: Hugging Face, Inc.
7
+ Author-email: [email protected]
8
+ License: Apache
9
+ Keywords: model-hub machine-learning models natural-language-processing deep-learning pytorch pretrained-models
10
+ Platform: UNKNOWN
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Education
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Requires-Python: >=3.8.0
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: filelock
29
+ Requires-Dist: fsspec>=2023.5.0
30
+ Requires-Dist: packaging>=20.9
31
+ Requires-Dist: pyyaml>=5.1
32
+ Requires-Dist: requests
33
+ Requires-Dist: tqdm>=4.42.1
34
+ Requires-Dist: typing-extensions>=3.7.4.3
35
+ Provides-Extra: all
36
+ Requires-Dist: InquirerPy==0.3.4; extra == "all"
37
+ Requires-Dist: aiohttp; extra == "all"
38
+ Requires-Dist: jedi; extra == "all"
39
+ Requires-Dist: Jinja2; extra == "all"
40
+ Requires-Dist: pytest<8.2.2,>=8.1.1; extra == "all"
41
+ Requires-Dist: pytest-cov; extra == "all"
42
+ Requires-Dist: pytest-env; extra == "all"
43
+ Requires-Dist: pytest-xdist; extra == "all"
44
+ Requires-Dist: pytest-vcr; extra == "all"
45
+ Requires-Dist: pytest-asyncio; extra == "all"
46
+ Requires-Dist: pytest-rerunfailures; extra == "all"
47
+ Requires-Dist: pytest-mock; extra == "all"
48
+ Requires-Dist: urllib3<2.0; extra == "all"
49
+ Requires-Dist: soundfile; extra == "all"
50
+ Requires-Dist: Pillow; extra == "all"
51
+ Requires-Dist: gradio>=4.0.0; extra == "all"
52
+ Requires-Dist: numpy; extra == "all"
53
+ Requires-Dist: fastapi; extra == "all"
54
+ Requires-Dist: ruff>=0.5.0; extra == "all"
55
+ Requires-Dist: mypy==1.5.1; extra == "all"
56
+ Requires-Dist: libcst==1.4.0; extra == "all"
57
+ Requires-Dist: typing-extensions>=4.8.0; extra == "all"
58
+ Requires-Dist: types-PyYAML; extra == "all"
59
+ Requires-Dist: types-requests; extra == "all"
60
+ Requires-Dist: types-simplejson; extra == "all"
61
+ Requires-Dist: types-toml; extra == "all"
62
+ Requires-Dist: types-tqdm; extra == "all"
63
+ Requires-Dist: types-urllib3; extra == "all"
64
+ Provides-Extra: cli
65
+ Requires-Dist: InquirerPy==0.3.4; extra == "cli"
66
+ Provides-Extra: dev
67
+ Requires-Dist: InquirerPy==0.3.4; extra == "dev"
68
+ Requires-Dist: aiohttp; extra == "dev"
69
+ Requires-Dist: jedi; extra == "dev"
70
+ Requires-Dist: Jinja2; extra == "dev"
71
+ Requires-Dist: pytest<8.2.2,>=8.1.1; extra == "dev"
72
+ Requires-Dist: pytest-cov; extra == "dev"
73
+ Requires-Dist: pytest-env; extra == "dev"
74
+ Requires-Dist: pytest-xdist; extra == "dev"
75
+ Requires-Dist: pytest-vcr; extra == "dev"
76
+ Requires-Dist: pytest-asyncio; extra == "dev"
77
+ Requires-Dist: pytest-rerunfailures; extra == "dev"
78
+ Requires-Dist: pytest-mock; extra == "dev"
79
+ Requires-Dist: urllib3<2.0; extra == "dev"
80
+ Requires-Dist: soundfile; extra == "dev"
81
+ Requires-Dist: Pillow; extra == "dev"
82
+ Requires-Dist: gradio>=4.0.0; extra == "dev"
83
+ Requires-Dist: numpy; extra == "dev"
84
+ Requires-Dist: fastapi; extra == "dev"
85
+ Requires-Dist: ruff>=0.5.0; extra == "dev"
86
+ Requires-Dist: mypy==1.5.1; extra == "dev"
87
+ Requires-Dist: libcst==1.4.0; extra == "dev"
88
+ Requires-Dist: typing-extensions>=4.8.0; extra == "dev"
89
+ Requires-Dist: types-PyYAML; extra == "dev"
90
+ Requires-Dist: types-requests; extra == "dev"
91
+ Requires-Dist: types-simplejson; extra == "dev"
92
+ Requires-Dist: types-toml; extra == "dev"
93
+ Requires-Dist: types-tqdm; extra == "dev"
94
+ Requires-Dist: types-urllib3; extra == "dev"
95
+ Provides-Extra: fastai
96
+ Requires-Dist: toml; extra == "fastai"
97
+ Requires-Dist: fastai>=2.4; extra == "fastai"
98
+ Requires-Dist: fastcore>=1.3.27; extra == "fastai"
99
+ Provides-Extra: hf_transfer
100
+ Requires-Dist: hf-transfer>=0.1.4; extra == "hf-transfer"
101
+ Provides-Extra: inference
102
+ Requires-Dist: aiohttp; extra == "inference"
103
+ Provides-Extra: quality
104
+ Requires-Dist: ruff>=0.5.0; extra == "quality"
105
+ Requires-Dist: mypy==1.5.1; extra == "quality"
106
+ Requires-Dist: libcst==1.4.0; extra == "quality"
107
+ Provides-Extra: tensorflow
108
+ Requires-Dist: tensorflow; extra == "tensorflow"
109
+ Requires-Dist: pydot; extra == "tensorflow"
110
+ Requires-Dist: graphviz; extra == "tensorflow"
111
+ Provides-Extra: tensorflow-testing
112
+ Requires-Dist: tensorflow; extra == "tensorflow-testing"
113
+ Requires-Dist: keras<3.0; extra == "tensorflow-testing"
114
+ Provides-Extra: testing
115
+ Requires-Dist: InquirerPy==0.3.4; extra == "testing"
116
+ Requires-Dist: aiohttp; extra == "testing"
117
+ Requires-Dist: jedi; extra == "testing"
118
+ Requires-Dist: Jinja2; extra == "testing"
119
+ Requires-Dist: pytest<8.2.2,>=8.1.1; extra == "testing"
120
+ Requires-Dist: pytest-cov; extra == "testing"
121
+ Requires-Dist: pytest-env; extra == "testing"
122
+ Requires-Dist: pytest-xdist; extra == "testing"
123
+ Requires-Dist: pytest-vcr; extra == "testing"
124
+ Requires-Dist: pytest-asyncio; extra == "testing"
125
+ Requires-Dist: pytest-rerunfailures; extra == "testing"
126
+ Requires-Dist: pytest-mock; extra == "testing"
127
+ Requires-Dist: urllib3<2.0; extra == "testing"
128
+ Requires-Dist: soundfile; extra == "testing"
129
+ Requires-Dist: Pillow; extra == "testing"
130
+ Requires-Dist: gradio>=4.0.0; extra == "testing"
131
+ Requires-Dist: numpy; extra == "testing"
132
+ Requires-Dist: fastapi; extra == "testing"
133
+ Provides-Extra: torch
134
+ Requires-Dist: torch; extra == "torch"
135
+ Requires-Dist: safetensors[torch]; extra == "torch"
136
+ Provides-Extra: typing
137
+ Requires-Dist: typing-extensions>=4.8.0; extra == "typing"
138
+ Requires-Dist: types-PyYAML; extra == "typing"
139
+ Requires-Dist: types-requests; extra == "typing"
140
+ Requires-Dist: types-simplejson; extra == "typing"
141
+ Requires-Dist: types-toml; extra == "typing"
142
+ Requires-Dist: types-tqdm; extra == "typing"
143
+ Requires-Dist: types-urllib3; extra == "typing"
144
+
145
+ <p align="center">
146
+ <picture>
147
+ <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub-dark.svg">
148
+ <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub.svg">
149
+ <img alt="huggingface_hub library logo" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/huggingface_hub.svg" width="352" height="59" style="max-width: 100%;">
150
+ </picture>
151
+ <br/>
152
+ <br/>
153
+ </p>
154
+
155
+ <p align="center">
156
+ <i>The official Python client for the Huggingface Hub.</i>
157
+ </p>
158
+
159
+ <p align="center">
160
+ <a href="https://huggingface.co/docs/huggingface_hub/en/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/huggingface_hub/index.svg?down_color=red&down_message=offline&up_message=online&label=doc"></a>
161
+ <a href="https://github.com/huggingface/huggingface_hub/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/huggingface_hub.svg"></a>
162
+ <a href="https://github.com/huggingface/huggingface_hub"><img alt="PyPi version" src="https://img.shields.io/pypi/pyversions/huggingface_hub.svg"></a>
163
+ <a href="https://pypi.org/project/huggingface-hub"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/huggingface_hub"></a>
164
+ <a href="https://codecov.io/gh/huggingface/huggingface_hub"><img alt="Code coverage" src="https://codecov.io/gh/huggingface/huggingface_hub/branch/main/graph/badge.svg?token=RXP95LE2XL"></a>
165
+ </p>
166
+
167
+ <h4 align="center">
168
+ <p>
169
+ <b>English</b> |
170
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_de.md">Deutsch</a> |
171
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_hi.md">हिंदी</a> |
172
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_ko.md">한국어</a> |
173
+ <a href="https://github.com/huggingface/huggingface_hub/blob/main/i18n/README_cn.md">中文(简体)</a>
174
+ <p>
175
+ </h4>
176
+
177
+ ---
178
+
179
+ **Documentation**: <a href="https://hf.co/docs/huggingface_hub" target="_blank">https://hf.co/docs/huggingface_hub</a>
180
+
181
+ **Source Code**: <a href="https://github.com/huggingface/huggingface_hub" target="_blank">https://github.com/huggingface/huggingface_hub</a>
182
+
183
+ ---
184
+
185
+ ## Welcome to the huggingface_hub library
186
+
187
+ The `huggingface_hub` library allows you to interact with the [Hugging Face Hub](https://huggingface.co/), a platform democratizing open-source Machine Learning for creators and collaborators. Discover pre-trained models and datasets for your projects or play with the thousands of machine learning apps hosted on the Hub. You can also create and share your own models, datasets and demos with the community. The `huggingface_hub` library provides a simple way to do all these things with Python.
188
+
189
+ ## Key features
190
+
191
+ - [Download files](https://huggingface.co/docs/huggingface_hub/en/guides/download) from the Hub.
192
+ - [Upload files](https://huggingface.co/docs/huggingface_hub/en/guides/upload) to the Hub.
193
+ - [Manage your repositories](https://huggingface.co/docs/huggingface_hub/en/guides/repository).
194
+ - [Run Inference](https://huggingface.co/docs/huggingface_hub/en/guides/inference) on deployed models.
195
+ - [Search](https://huggingface.co/docs/huggingface_hub/en/guides/search) for models, datasets and Spaces.
196
+ - [Share Model Cards](https://huggingface.co/docs/huggingface_hub/en/guides/model-cards) to document your models.
197
+ - [Engage with the community](https://huggingface.co/docs/huggingface_hub/en/guides/community) through PRs and comments.
198
+
199
+ ## Installation
200
+
201
+ Install the `huggingface_hub` package with [pip](https://pypi.org/project/huggingface-hub/):
202
+
203
+ ```bash
204
+ pip install huggingface_hub
205
+ ```
206
+
207
+ If you prefer, you can also install it with [conda](https://huggingface.co/docs/huggingface_hub/en/installation#install-with-conda).
208
+
209
+ In order to keep the package minimal by default, `huggingface_hub` comes with optional dependencies useful for some use cases. For example, if you want to have a complete experience for Inference, run:
210
+
211
+ ```bash
212
+ pip install huggingface_hub[inference]
213
+ ```
214
+
215
+ To learn more about installation and optional dependencies, check out the [installation guide](https://huggingface.co/docs/huggingface_hub/en/installation).
216
+
217
+ ## Quick start
218
+
219
+ ### Download files
220
+
221
+ Download a single file
222
+
223
+ ```py
224
+ from huggingface_hub import hf_hub_download
225
+
226
+ hf_hub_download(repo_id="tiiuae/falcon-7b-instruct", filename="config.json")
227
+ ```
228
+
229
+ Or an entire repository
230
+
231
+ ```py
232
+ from huggingface_hub import snapshot_download
233
+
234
+ snapshot_download("stabilityai/stable-diffusion-2-1")
235
+ ```
236
+
237
+ Files will be downloaded in a local cache folder. More details in [this guide](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache).
238
+
239
+ ### Login
240
+
241
+ The Hugging Face Hub uses tokens to authenticate applications (see [docs](https://huggingface.co/docs/hub/security-tokens)). To log in your machine, run the following CLI:
242
+
243
+ ```bash
244
+ huggingface-cli login
245
+ # or using an environment variable
246
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
247
+ ```
248
+
249
+ ### Create a repository
250
+
251
+ ```py
252
+ from huggingface_hub import create_repo
253
+
254
+ create_repo(repo_id="super-cool-model")
255
+ ```
256
+
257
+ ### Upload files
258
+
259
+ Upload a single file
260
+
261
+ ```py
262
+ from huggingface_hub import upload_file
263
+
264
+ upload_file(
265
+ path_or_fileobj="/home/lysandre/dummy-test/README.md",
266
+ path_in_repo="README.md",
267
+ repo_id="lysandre/test-model",
268
+ )
269
+ ```
270
+
271
+ Or an entire folder
272
+
273
+ ```py
274
+ from huggingface_hub import upload_folder
275
+
276
+ upload_folder(
277
+ folder_path="/path/to/local/space",
278
+ repo_id="username/my-cool-space",
279
+ repo_type="space",
280
+ )
281
+ ```
282
+
283
+ For more details, check out the [upload guide](https://huggingface.co/docs/huggingface_hub/en/guides/upload).
284
+
285
+ ## Integrating with the Hub
286
+
287
+ We're partnering with cool open source ML libraries to provide free model hosting and versioning. You can find the existing integrations [here](https://huggingface.co/docs/hub/libraries).
288
+
289
+ The advantages are:
290
+
291
+ - Free model or dataset hosting for libraries and their users.
292
+ - Built-in file versioning, even with very large files, thanks to a git-based approach.
293
+ - Serverless inference API for all models publicly available.
294
+ - In-browser widgets to play with the uploaded models.
295
+ - Anyone can upload a new model for your library, they just need to add the corresponding tag for the model to be discoverable.
296
+ - Fast downloads! We use Cloudfront (a CDN) to geo-replicate downloads so they're blazing fast from anywhere on the globe.
297
+ - Usage stats and more features to come.
298
+
299
+ If you would like to integrate your library, feel free to open an issue to begin the discussion. We wrote a [step-by-step guide](https://huggingface.co/docs/hub/adding-a-library) with ❤️ showing how to do this integration.
300
+
301
+ ## Contributions (feature requests, bugs, etc.) are super welcome 💙💚💛💜🧡❤️
302
+
303
+ Everyone is welcome to contribute, and we value everybody's contribution. Code is not the only way to help the community.
304
+ Answering questions, helping others, reaching out and improving the documentation are immensely valuable to the community.
305
+ We wrote a [contribution guide](https://github.com/huggingface/huggingface_hub/blob/main/CONTRIBUTING.md) to summarize
306
+ how to get started to contribute to this repository.
307
+
308
+
meow/lib/python3.13/site-packages/huggingface_hub/__init__.py ADDED
@@ -0,0 +1,1028 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # ***********
16
+ # `huggingface_hub` init has 2 modes:
17
+ # - Normal usage:
18
+ # If imported to use it, all modules and functions are lazy-loaded. This means
19
+ # they exist at top level in module but are imported only the first time they are
20
+ # used. This way, `from huggingface_hub import something` will import `something`
21
+ # quickly without the hassle of importing all the features from `huggingface_hub`.
22
+ # - Static check:
23
+ # If statically analyzed, all modules and functions are loaded normally. This way
24
+ # static typing check works properly as well as autocomplete in text editors and
25
+ # IDEs.
26
+ #
27
+ # The static model imports are done inside the `if TYPE_CHECKING:` statement at
28
+ # the bottom of this file. Since module/functions imports are duplicated, it is
29
+ # mandatory to make sure to add them twice when adding one. This is checked in the
30
+ # `make quality` command.
31
+ #
32
+ # To update the static imports, please run the following command and commit the changes.
33
+ # ```
34
+ # # Use script
35
+ # python utils/check_static_imports.py --update-file
36
+ #
37
+ # # Or run style on codebase
38
+ # make style
39
+ # ```
40
+ #
41
+ # ***********
42
+ # Lazy loader vendored from https://github.com/scientific-python/lazy_loader
43
+ import importlib
44
+ import os
45
+ import sys
46
+ from typing import TYPE_CHECKING
47
+
48
+
49
+ __version__ = "0.27.0"
50
+
51
+ # Alphabetical order of definitions is ensured in tests
52
+ # WARNING: any comment added in this dictionary definition will be lost when
53
+ # re-generating the file !
54
# Mapping of submodule name -> list of public attributes to lazy-load from it.
# This dict is the single source of truth consumed by `_attach` below to build
# the package-level `__getattr__`/`__dir__`/`__all__`. NOTE: comments must not
# be placed *inside* the braces — the dict is auto-regenerated and inner
# comments would be lost.
_SUBMOD_ATTRS = {
    "_commit_scheduler": [
        "CommitScheduler",
    ],
    "_inference_endpoints": [
        "InferenceEndpoint",
        "InferenceEndpointError",
        "InferenceEndpointStatus",
        "InferenceEndpointTimeoutError",
        "InferenceEndpointType",
    ],
    "_login": [
        "auth_list",
        "auth_switch",
        "interpreter_login",
        "login",
        "logout",
        "notebook_login",
    ],
    "_snapshot_download": [
        "snapshot_download",
    ],
    "_space_api": [
        "SpaceHardware",
        "SpaceRuntime",
        "SpaceStage",
        "SpaceStorage",
        "SpaceVariable",
    ],
    "_tensorboard_logger": [
        "HFSummaryWriter",
    ],
    "_webhooks_payload": [
        "WebhookPayload",
        "WebhookPayloadComment",
        "WebhookPayloadDiscussion",
        "WebhookPayloadDiscussionChanges",
        "WebhookPayloadEvent",
        "WebhookPayloadMovedTo",
        "WebhookPayloadRepo",
        "WebhookPayloadUrl",
        "WebhookPayloadWebhook",
    ],
    "_webhooks_server": [
        "WebhooksServer",
        "webhook_endpoint",
    ],
    "community": [
        "Discussion",
        "DiscussionComment",
        "DiscussionCommit",
        "DiscussionEvent",
        "DiscussionStatusChange",
        "DiscussionTitleChange",
        "DiscussionWithDetails",
    ],
    "constants": [
        "CONFIG_NAME",
        "FLAX_WEIGHTS_NAME",
        "HUGGINGFACE_CO_URL_HOME",
        "HUGGINGFACE_CO_URL_TEMPLATE",
        "PYTORCH_WEIGHTS_NAME",
        "REPO_TYPE_DATASET",
        "REPO_TYPE_MODEL",
        "REPO_TYPE_SPACE",
        "TF2_WEIGHTS_NAME",
        "TF_WEIGHTS_NAME",
    ],
    "fastai_utils": [
        "_save_pretrained_fastai",
        "from_pretrained_fastai",
        "push_to_hub_fastai",
    ],
    "file_download": [
        "HfFileMetadata",
        "_CACHED_NO_EXIST",
        "get_hf_file_metadata",
        "hf_hub_download",
        "hf_hub_url",
        "try_to_load_from_cache",
    ],
    "hf_api": [
        "Collection",
        "CollectionItem",
        "CommitInfo",
        "CommitOperation",
        "CommitOperationAdd",
        "CommitOperationCopy",
        "CommitOperationDelete",
        "DatasetInfo",
        "GitCommitInfo",
        "GitRefInfo",
        "GitRefs",
        "HfApi",
        "ModelInfo",
        "RepoUrl",
        "SpaceInfo",
        "User",
        "UserLikes",
        "WebhookInfo",
        "WebhookWatchedItem",
        "accept_access_request",
        "add_collection_item",
        "add_space_secret",
        "add_space_variable",
        "auth_check",
        "cancel_access_request",
        "change_discussion_status",
        "comment_discussion",
        "create_branch",
        "create_collection",
        "create_commit",
        "create_discussion",
        "create_inference_endpoint",
        "create_pull_request",
        "create_repo",
        "create_tag",
        "create_webhook",
        "dataset_info",
        "delete_branch",
        "delete_collection",
        "delete_collection_item",
        "delete_file",
        "delete_folder",
        "delete_inference_endpoint",
        "delete_repo",
        "delete_space_secret",
        "delete_space_storage",
        "delete_space_variable",
        "delete_tag",
        "delete_webhook",
        "disable_webhook",
        "duplicate_space",
        "edit_discussion_comment",
        "enable_webhook",
        "file_exists",
        "get_collection",
        "get_dataset_tags",
        "get_discussion_details",
        "get_full_repo_name",
        "get_inference_endpoint",
        "get_model_tags",
        "get_paths_info",
        "get_repo_discussions",
        "get_safetensors_metadata",
        "get_space_runtime",
        "get_space_variables",
        "get_token_permission",
        "get_user_overview",
        "get_webhook",
        "grant_access",
        "like",
        "list_accepted_access_requests",
        "list_collections",
        "list_datasets",
        "list_inference_endpoints",
        "list_liked_repos",
        "list_models",
        "list_organization_members",
        "list_papers",
        "list_pending_access_requests",
        "list_rejected_access_requests",
        "list_repo_commits",
        "list_repo_files",
        "list_repo_likers",
        "list_repo_refs",
        "list_repo_tree",
        "list_spaces",
        "list_user_followers",
        "list_user_following",
        "list_webhooks",
        "merge_pull_request",
        "model_info",
        "move_repo",
        "paper_info",
        "parse_safetensors_file_metadata",
        "pause_inference_endpoint",
        "pause_space",
        "preupload_lfs_files",
        "reject_access_request",
        "rename_discussion",
        "repo_exists",
        "repo_info",
        "repo_type_and_id_from_hf_id",
        "request_space_hardware",
        "request_space_storage",
        "restart_space",
        "resume_inference_endpoint",
        "revision_exists",
        "run_as_future",
        "scale_to_zero_inference_endpoint",
        "set_space_sleep_time",
        "space_info",
        "super_squash_history",
        "unlike",
        "update_collection_item",
        "update_collection_metadata",
        "update_inference_endpoint",
        "update_repo_settings",
        "update_repo_visibility",
        "update_webhook",
        "upload_file",
        "upload_folder",
        "upload_large_folder",
        "whoami",
    ],
    "hf_file_system": [
        "HfFileSystem",
        "HfFileSystemFile",
        "HfFileSystemResolvedPath",
        "HfFileSystemStreamFile",
    ],
    "hub_mixin": [
        "ModelHubMixin",
        "PyTorchModelHubMixin",
    ],
    "inference._client": [
        "InferenceClient",
        "InferenceTimeoutError",
    ],
    "inference._generated._async_client": [
        "AsyncInferenceClient",
    ],
    "inference._generated.types": [
        "AudioClassificationInput",
        "AudioClassificationOutputElement",
        "AudioClassificationOutputTransform",
        "AudioClassificationParameters",
        "AudioToAudioInput",
        "AudioToAudioOutputElement",
        "AutomaticSpeechRecognitionEarlyStoppingEnum",
        "AutomaticSpeechRecognitionGenerationParameters",
        "AutomaticSpeechRecognitionInput",
        "AutomaticSpeechRecognitionOutput",
        "AutomaticSpeechRecognitionOutputChunk",
        "AutomaticSpeechRecognitionParameters",
        "ChatCompletionInput",
        "ChatCompletionInputFunctionDefinition",
        "ChatCompletionInputFunctionName",
        "ChatCompletionInputGrammarType",
        "ChatCompletionInputGrammarTypeType",
        "ChatCompletionInputMessage",
        "ChatCompletionInputMessageChunk",
        "ChatCompletionInputMessageChunkType",
        "ChatCompletionInputStreamOptions",
        "ChatCompletionInputTool",
        "ChatCompletionInputToolChoiceClass",
        "ChatCompletionInputToolChoiceEnum",
        "ChatCompletionInputURL",
        "ChatCompletionOutput",
        "ChatCompletionOutputComplete",
        "ChatCompletionOutputFunctionDefinition",
        "ChatCompletionOutputLogprob",
        "ChatCompletionOutputLogprobs",
        "ChatCompletionOutputMessage",
        "ChatCompletionOutputToolCall",
        "ChatCompletionOutputTopLogprob",
        "ChatCompletionOutputUsage",
        "ChatCompletionStreamOutput",
        "ChatCompletionStreamOutputChoice",
        "ChatCompletionStreamOutputDelta",
        "ChatCompletionStreamOutputDeltaToolCall",
        "ChatCompletionStreamOutputFunction",
        "ChatCompletionStreamOutputLogprob",
        "ChatCompletionStreamOutputLogprobs",
        "ChatCompletionStreamOutputTopLogprob",
        "ChatCompletionStreamOutputUsage",
        "DepthEstimationInput",
        "DepthEstimationOutput",
        "DocumentQuestionAnsweringInput",
        "DocumentQuestionAnsweringInputData",
        "DocumentQuestionAnsweringOutputElement",
        "DocumentQuestionAnsweringParameters",
        "FeatureExtractionInput",
        "FeatureExtractionInputTruncationDirection",
        "FillMaskInput",
        "FillMaskOutputElement",
        "FillMaskParameters",
        "ImageClassificationInput",
        "ImageClassificationOutputElement",
        "ImageClassificationOutputTransform",
        "ImageClassificationParameters",
        "ImageSegmentationInput",
        "ImageSegmentationOutputElement",
        "ImageSegmentationParameters",
        "ImageSegmentationSubtask",
        "ImageToImageInput",
        "ImageToImageOutput",
        "ImageToImageParameters",
        "ImageToImageTargetSize",
        "ImageToTextEarlyStoppingEnum",
        "ImageToTextGenerationParameters",
        "ImageToTextInput",
        "ImageToTextOutput",
        "ImageToTextParameters",
        "ObjectDetectionBoundingBox",
        "ObjectDetectionInput",
        "ObjectDetectionOutputElement",
        "ObjectDetectionParameters",
        "Padding",
        "QuestionAnsweringInput",
        "QuestionAnsweringInputData",
        "QuestionAnsweringOutputElement",
        "QuestionAnsweringParameters",
        "SentenceSimilarityInput",
        "SentenceSimilarityInputData",
        "SummarizationInput",
        "SummarizationOutput",
        "SummarizationParameters",
        "SummarizationTruncationStrategy",
        "TableQuestionAnsweringInput",
        "TableQuestionAnsweringInputData",
        "TableQuestionAnsweringOutputElement",
        "TableQuestionAnsweringParameters",
        "Text2TextGenerationInput",
        "Text2TextGenerationOutput",
        "Text2TextGenerationParameters",
        "Text2TextGenerationTruncationStrategy",
        "TextClassificationInput",
        "TextClassificationOutputElement",
        "TextClassificationOutputTransform",
        "TextClassificationParameters",
        "TextGenerationInput",
        "TextGenerationInputGenerateParameters",
        "TextGenerationInputGrammarType",
        "TextGenerationOutput",
        "TextGenerationOutputBestOfSequence",
        "TextGenerationOutputDetails",
        "TextGenerationOutputFinishReason",
        "TextGenerationOutputPrefillToken",
        "TextGenerationOutputToken",
        "TextGenerationStreamOutput",
        "TextGenerationStreamOutputStreamDetails",
        "TextGenerationStreamOutputToken",
        "TextToAudioEarlyStoppingEnum",
        "TextToAudioGenerationParameters",
        "TextToAudioInput",
        "TextToAudioOutput",
        "TextToAudioParameters",
        "TextToImageInput",
        "TextToImageOutput",
        "TextToImageParameters",
        "TextToImageTargetSize",
        "TextToSpeechEarlyStoppingEnum",
        "TextToSpeechGenerationParameters",
        "TextToSpeechInput",
        "TextToSpeechOutput",
        "TextToSpeechParameters",
        "TokenClassificationAggregationStrategy",
        "TokenClassificationInput",
        "TokenClassificationOutputElement",
        "TokenClassificationParameters",
        "TranslationInput",
        "TranslationOutput",
        "TranslationParameters",
        "TranslationTruncationStrategy",
        "TypeEnum",
        "VideoClassificationInput",
        "VideoClassificationOutputElement",
        "VideoClassificationOutputTransform",
        "VideoClassificationParameters",
        "VisualQuestionAnsweringInput",
        "VisualQuestionAnsweringInputData",
        "VisualQuestionAnsweringOutputElement",
        "VisualQuestionAnsweringParameters",
        "ZeroShotClassificationInput",
        "ZeroShotClassificationOutputElement",
        "ZeroShotClassificationParameters",
        "ZeroShotImageClassificationInput",
        "ZeroShotImageClassificationOutputElement",
        "ZeroShotImageClassificationParameters",
        "ZeroShotObjectDetectionBoundingBox",
        "ZeroShotObjectDetectionInput",
        "ZeroShotObjectDetectionOutputElement",
        "ZeroShotObjectDetectionParameters",
    ],
    "inference_api": [
        "InferenceApi",
    ],
    "keras_mixin": [
        "KerasModelHubMixin",
        "from_pretrained_keras",
        "push_to_hub_keras",
        "save_pretrained_keras",
    ],
    "repocard": [
        "DatasetCard",
        "ModelCard",
        "RepoCard",
        "SpaceCard",
        "metadata_eval_result",
        "metadata_load",
        "metadata_save",
        "metadata_update",
    ],
    "repocard_data": [
        "CardData",
        "DatasetCardData",
        "EvalResult",
        "ModelCardData",
        "SpaceCardData",
    ],
    "repository": [
        "Repository",
    ],
    "serialization": [
        "StateDictSplit",
        "get_tf_storage_size",
        "get_torch_storage_id",
        "get_torch_storage_size",
        "load_state_dict_from_file",
        "load_torch_model",
        "save_torch_model",
        "save_torch_state_dict",
        "split_state_dict_into_shards_factory",
        "split_tf_state_dict_into_shards",
        "split_torch_state_dict_into_shards",
    ],
    "serialization._dduf": [
        "DDUFEntry",
        "export_entries_as_dduf",
        "export_folder_as_dduf",
        "read_dduf_file",
    ],
    "utils": [
        "CacheNotFound",
        "CachedFileInfo",
        "CachedRepoInfo",
        "CachedRevisionInfo",
        "CorruptedCacheException",
        "DeleteCacheStrategy",
        "HFCacheInfo",
        "HfFolder",
        "cached_assets_path",
        "configure_http_backend",
        "dump_environment_info",
        "get_session",
        "get_token",
        "logging",
        "scan_cache_dir",
    ],
}
496
+
497
+
498
+ def _attach(package_name, submodules=None, submod_attrs=None):
499
+ """Attach lazily loaded submodules, functions, or other attributes.
500
+
501
+ Typically, modules import submodules and attributes as follows:
502
+
503
+ ```py
504
+ import mysubmodule
505
+ import anothersubmodule
506
+
507
+ from .foo import someattr
508
+ ```
509
+
510
+ The idea is to replace a package's `__getattr__`, `__dir__`, and
511
+ `__all__`, such that all imports work exactly the way they would
512
+ with normal imports, except that the import occurs upon first use.
513
+
514
+ The typical way to call this function, replacing the above imports, is:
515
+
516
+ ```python
517
+ __getattr__, __dir__, __all__ = lazy.attach(
518
+ __name__,
519
+ ['mysubmodule', 'anothersubmodule'],
520
+ {'foo': ['someattr']}
521
+ )
522
+ ```
523
+ This functionality requires Python 3.7 or higher.
524
+
525
+ Args:
526
+ package_name (`str`):
527
+ Typically use `__name__`.
528
+ submodules (`set`):
529
+ List of submodules to attach.
530
+ submod_attrs (`dict`):
531
+ Dictionary of submodule -> list of attributes / functions.
532
+ These attributes are imported as they are used.
533
+
534
+ Returns:
535
+ __getattr__, __dir__, __all__
536
+
537
+ """
538
+ if submod_attrs is None:
539
+ submod_attrs = {}
540
+
541
+ if submodules is None:
542
+ submodules = set()
543
+ else:
544
+ submodules = set(submodules)
545
+
546
+ attr_to_modules = {attr: mod for mod, attrs in submod_attrs.items() for attr in attrs}
547
+
548
+ __all__ = list(submodules | attr_to_modules.keys())
549
+
550
+ def __getattr__(name):
551
+ if name in submodules:
552
+ try:
553
+ return importlib.import_module(f"{package_name}.{name}")
554
+ except Exception as e:
555
+ print(f"Error importing {package_name}.{name}: {e}")
556
+ raise
557
+ elif name in attr_to_modules:
558
+ submod_path = f"{package_name}.{attr_to_modules[name]}"
559
+ try:
560
+ submod = importlib.import_module(submod_path)
561
+ except Exception as e:
562
+ print(f"Error importing {submod_path}: {e}")
563
+ raise
564
+ attr = getattr(submod, name)
565
+
566
+ # If the attribute lives in a file (module) with the same
567
+ # name as the attribute, ensure that the attribute and *not*
568
+ # the module is accessible on the package.
569
+ if name == attr_to_modules[name]:
570
+ pkg = sys.modules[package_name]
571
+ pkg.__dict__[name] = attr
572
+
573
+ return attr
574
+ else:
575
+ raise AttributeError(f"No {package_name} attribute {name}")
576
+
577
+ def __dir__():
578
+ return __all__
579
+
580
+ return __getattr__, __dir__, list(__all__)
581
+
582
+
583
# Install the lazy-loading hooks on this package: module-level `__getattr__`
# (PEP 562) imports the real submodule the first time an attribute is used.
__getattr__, __dir__, __all__ = _attach(__name__, submodules=[], submod_attrs=_SUBMOD_ATTRS)

# Escape hatch: a non-empty EAGER_IMPORT environment variable forces every
# lazy attribute to be imported immediately (useful to surface import-time
# failures up front instead of at first use).
if os.environ.get("EAGER_IMPORT", ""):
    for attr in __all__:
        __getattr__(attr)
588
+
589
+ # WARNING: any content below this statement is generated automatically. Any manual edit
590
+ # will be lost when re-generating this file !
591
+ #
592
+ # To update the static imports, please run the following command and commit the changes.
593
+ # ```
594
+ # # Use script
595
+ # python utils/check_static_imports.py --update-file
596
+ #
597
+ # # Or run style on codebase
598
+ # make style
599
+ # ```
600
+ if TYPE_CHECKING: # pragma: no cover
601
+ from ._commit_scheduler import CommitScheduler # noqa: F401
602
+ from ._inference_endpoints import (
603
+ InferenceEndpoint, # noqa: F401
604
+ InferenceEndpointError, # noqa: F401
605
+ InferenceEndpointStatus, # noqa: F401
606
+ InferenceEndpointTimeoutError, # noqa: F401
607
+ InferenceEndpointType, # noqa: F401
608
+ )
609
+ from ._login import (
610
+ auth_list, # noqa: F401
611
+ auth_switch, # noqa: F401
612
+ interpreter_login, # noqa: F401
613
+ login, # noqa: F401
614
+ logout, # noqa: F401
615
+ notebook_login, # noqa: F401
616
+ )
617
+ from ._snapshot_download import snapshot_download # noqa: F401
618
+ from ._space_api import (
619
+ SpaceHardware, # noqa: F401
620
+ SpaceRuntime, # noqa: F401
621
+ SpaceStage, # noqa: F401
622
+ SpaceStorage, # noqa: F401
623
+ SpaceVariable, # noqa: F401
624
+ )
625
+ from ._tensorboard_logger import HFSummaryWriter # noqa: F401
626
+ from ._webhooks_payload import (
627
+ WebhookPayload, # noqa: F401
628
+ WebhookPayloadComment, # noqa: F401
629
+ WebhookPayloadDiscussion, # noqa: F401
630
+ WebhookPayloadDiscussionChanges, # noqa: F401
631
+ WebhookPayloadEvent, # noqa: F401
632
+ WebhookPayloadMovedTo, # noqa: F401
633
+ WebhookPayloadRepo, # noqa: F401
634
+ WebhookPayloadUrl, # noqa: F401
635
+ WebhookPayloadWebhook, # noqa: F401
636
+ )
637
+ from ._webhooks_server import (
638
+ WebhooksServer, # noqa: F401
639
+ webhook_endpoint, # noqa: F401
640
+ )
641
+ from .community import (
642
+ Discussion, # noqa: F401
643
+ DiscussionComment, # noqa: F401
644
+ DiscussionCommit, # noqa: F401
645
+ DiscussionEvent, # noqa: F401
646
+ DiscussionStatusChange, # noqa: F401
647
+ DiscussionTitleChange, # noqa: F401
648
+ DiscussionWithDetails, # noqa: F401
649
+ )
650
+ from .constants import (
651
+ CONFIG_NAME, # noqa: F401
652
+ FLAX_WEIGHTS_NAME, # noqa: F401
653
+ HUGGINGFACE_CO_URL_HOME, # noqa: F401
654
+ HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401
655
+ PYTORCH_WEIGHTS_NAME, # noqa: F401
656
+ REPO_TYPE_DATASET, # noqa: F401
657
+ REPO_TYPE_MODEL, # noqa: F401
658
+ REPO_TYPE_SPACE, # noqa: F401
659
+ TF2_WEIGHTS_NAME, # noqa: F401
660
+ TF_WEIGHTS_NAME, # noqa: F401
661
+ )
662
+ from .fastai_utils import (
663
+ _save_pretrained_fastai, # noqa: F401
664
+ from_pretrained_fastai, # noqa: F401
665
+ push_to_hub_fastai, # noqa: F401
666
+ )
667
+ from .file_download import (
668
+ _CACHED_NO_EXIST, # noqa: F401
669
+ HfFileMetadata, # noqa: F401
670
+ get_hf_file_metadata, # noqa: F401
671
+ hf_hub_download, # noqa: F401
672
+ hf_hub_url, # noqa: F401
673
+ try_to_load_from_cache, # noqa: F401
674
+ )
675
+ from .hf_api import (
676
+ Collection, # noqa: F401
677
+ CollectionItem, # noqa: F401
678
+ CommitInfo, # noqa: F401
679
+ CommitOperation, # noqa: F401
680
+ CommitOperationAdd, # noqa: F401
681
+ CommitOperationCopy, # noqa: F401
682
+ CommitOperationDelete, # noqa: F401
683
+ DatasetInfo, # noqa: F401
684
+ GitCommitInfo, # noqa: F401
685
+ GitRefInfo, # noqa: F401
686
+ GitRefs, # noqa: F401
687
+ HfApi, # noqa: F401
688
+ ModelInfo, # noqa: F401
689
+ RepoUrl, # noqa: F401
690
+ SpaceInfo, # noqa: F401
691
+ User, # noqa: F401
692
+ UserLikes, # noqa: F401
693
+ WebhookInfo, # noqa: F401
694
+ WebhookWatchedItem, # noqa: F401
695
+ accept_access_request, # noqa: F401
696
+ add_collection_item, # noqa: F401
697
+ add_space_secret, # noqa: F401
698
+ add_space_variable, # noqa: F401
699
+ auth_check, # noqa: F401
700
+ cancel_access_request, # noqa: F401
701
+ change_discussion_status, # noqa: F401
702
+ comment_discussion, # noqa: F401
703
+ create_branch, # noqa: F401
704
+ create_collection, # noqa: F401
705
+ create_commit, # noqa: F401
706
+ create_discussion, # noqa: F401
707
+ create_inference_endpoint, # noqa: F401
708
+ create_pull_request, # noqa: F401
709
+ create_repo, # noqa: F401
710
+ create_tag, # noqa: F401
711
+ create_webhook, # noqa: F401
712
+ dataset_info, # noqa: F401
713
+ delete_branch, # noqa: F401
714
+ delete_collection, # noqa: F401
715
+ delete_collection_item, # noqa: F401
716
+ delete_file, # noqa: F401
717
+ delete_folder, # noqa: F401
718
+ delete_inference_endpoint, # noqa: F401
719
+ delete_repo, # noqa: F401
720
+ delete_space_secret, # noqa: F401
721
+ delete_space_storage, # noqa: F401
722
+ delete_space_variable, # noqa: F401
723
+ delete_tag, # noqa: F401
724
+ delete_webhook, # noqa: F401
725
+ disable_webhook, # noqa: F401
726
+ duplicate_space, # noqa: F401
727
+ edit_discussion_comment, # noqa: F401
728
+ enable_webhook, # noqa: F401
729
+ file_exists, # noqa: F401
730
+ get_collection, # noqa: F401
731
+ get_dataset_tags, # noqa: F401
732
+ get_discussion_details, # noqa: F401
733
+ get_full_repo_name, # noqa: F401
734
+ get_inference_endpoint, # noqa: F401
735
+ get_model_tags, # noqa: F401
736
+ get_paths_info, # noqa: F401
737
+ get_repo_discussions, # noqa: F401
738
+ get_safetensors_metadata, # noqa: F401
739
+ get_space_runtime, # noqa: F401
740
+ get_space_variables, # noqa: F401
741
+ get_token_permission, # noqa: F401
742
+ get_user_overview, # noqa: F401
743
+ get_webhook, # noqa: F401
744
+ grant_access, # noqa: F401
745
+ like, # noqa: F401
746
+ list_accepted_access_requests, # noqa: F401
747
+ list_collections, # noqa: F401
748
+ list_datasets, # noqa: F401
749
+ list_inference_endpoints, # noqa: F401
750
+ list_liked_repos, # noqa: F401
751
+ list_models, # noqa: F401
752
+ list_organization_members, # noqa: F401
753
+ list_papers, # noqa: F401
754
+ list_pending_access_requests, # noqa: F401
755
+ list_rejected_access_requests, # noqa: F401
756
+ list_repo_commits, # noqa: F401
757
+ list_repo_files, # noqa: F401
758
+ list_repo_likers, # noqa: F401
759
+ list_repo_refs, # noqa: F401
760
+ list_repo_tree, # noqa: F401
761
+ list_spaces, # noqa: F401
762
+ list_user_followers, # noqa: F401
763
+ list_user_following, # noqa: F401
764
+ list_webhooks, # noqa: F401
765
+ merge_pull_request, # noqa: F401
766
+ model_info, # noqa: F401
767
+ move_repo, # noqa: F401
768
+ paper_info, # noqa: F401
769
+ parse_safetensors_file_metadata, # noqa: F401
770
+ pause_inference_endpoint, # noqa: F401
771
+ pause_space, # noqa: F401
772
+ preupload_lfs_files, # noqa: F401
773
+ reject_access_request, # noqa: F401
774
+ rename_discussion, # noqa: F401
775
+ repo_exists, # noqa: F401
776
+ repo_info, # noqa: F401
777
+ repo_type_and_id_from_hf_id, # noqa: F401
778
+ request_space_hardware, # noqa: F401
779
+ request_space_storage, # noqa: F401
780
+ restart_space, # noqa: F401
781
+ resume_inference_endpoint, # noqa: F401
782
+ revision_exists, # noqa: F401
783
+ run_as_future, # noqa: F401
784
+ scale_to_zero_inference_endpoint, # noqa: F401
785
+ set_space_sleep_time, # noqa: F401
786
+ space_info, # noqa: F401
787
+ super_squash_history, # noqa: F401
788
+ unlike, # noqa: F401
789
+ update_collection_item, # noqa: F401
790
+ update_collection_metadata, # noqa: F401
791
+ update_inference_endpoint, # noqa: F401
792
+ update_repo_settings, # noqa: F401
793
+ update_repo_visibility, # noqa: F401
794
+ update_webhook, # noqa: F401
795
+ upload_file, # noqa: F401
796
+ upload_folder, # noqa: F401
797
+ upload_large_folder, # noqa: F401
798
+ whoami, # noqa: F401
799
+ )
800
+ from .hf_file_system import (
801
+ HfFileSystem, # noqa: F401
802
+ HfFileSystemFile, # noqa: F401
803
+ HfFileSystemResolvedPath, # noqa: F401
804
+ HfFileSystemStreamFile, # noqa: F401
805
+ )
806
+ from .hub_mixin import (
807
+ ModelHubMixin, # noqa: F401
808
+ PyTorchModelHubMixin, # noqa: F401
809
+ )
810
+ from .inference._client import (
811
+ InferenceClient, # noqa: F401
812
+ InferenceTimeoutError, # noqa: F401
813
+ )
814
+ from .inference._generated._async_client import AsyncInferenceClient # noqa: F401
815
+ from .inference._generated.types import (
816
+ AudioClassificationInput, # noqa: F401
817
+ AudioClassificationOutputElement, # noqa: F401
818
+ AudioClassificationOutputTransform, # noqa: F401
819
+ AudioClassificationParameters, # noqa: F401
820
+ AudioToAudioInput, # noqa: F401
821
+ AudioToAudioOutputElement, # noqa: F401
822
+ AutomaticSpeechRecognitionEarlyStoppingEnum, # noqa: F401
823
+ AutomaticSpeechRecognitionGenerationParameters, # noqa: F401
824
+ AutomaticSpeechRecognitionInput, # noqa: F401
825
+ AutomaticSpeechRecognitionOutput, # noqa: F401
826
+ AutomaticSpeechRecognitionOutputChunk, # noqa: F401
827
+ AutomaticSpeechRecognitionParameters, # noqa: F401
828
+ ChatCompletionInput, # noqa: F401
829
+ ChatCompletionInputFunctionDefinition, # noqa: F401
830
+ ChatCompletionInputFunctionName, # noqa: F401
831
+ ChatCompletionInputGrammarType, # noqa: F401
832
+ ChatCompletionInputGrammarTypeType, # noqa: F401
833
+ ChatCompletionInputMessage, # noqa: F401
834
+ ChatCompletionInputMessageChunk, # noqa: F401
835
+ ChatCompletionInputMessageChunkType, # noqa: F401
836
+ ChatCompletionInputStreamOptions, # noqa: F401
837
+ ChatCompletionInputTool, # noqa: F401
838
+ ChatCompletionInputToolChoiceClass, # noqa: F401
839
+ ChatCompletionInputToolChoiceEnum, # noqa: F401
840
+ ChatCompletionInputURL, # noqa: F401
841
+ ChatCompletionOutput, # noqa: F401
842
+ ChatCompletionOutputComplete, # noqa: F401
843
+ ChatCompletionOutputFunctionDefinition, # noqa: F401
844
+ ChatCompletionOutputLogprob, # noqa: F401
845
+ ChatCompletionOutputLogprobs, # noqa: F401
846
+ ChatCompletionOutputMessage, # noqa: F401
847
+ ChatCompletionOutputToolCall, # noqa: F401
848
+ ChatCompletionOutputTopLogprob, # noqa: F401
849
+ ChatCompletionOutputUsage, # noqa: F401
850
+ ChatCompletionStreamOutput, # noqa: F401
851
+ ChatCompletionStreamOutputChoice, # noqa: F401
852
+ ChatCompletionStreamOutputDelta, # noqa: F401
853
+ ChatCompletionStreamOutputDeltaToolCall, # noqa: F401
854
+ ChatCompletionStreamOutputFunction, # noqa: F401
855
+ ChatCompletionStreamOutputLogprob, # noqa: F401
856
+ ChatCompletionStreamOutputLogprobs, # noqa: F401
857
+ ChatCompletionStreamOutputTopLogprob, # noqa: F401
858
+ ChatCompletionStreamOutputUsage, # noqa: F401
859
+ DepthEstimationInput, # noqa: F401
860
+ DepthEstimationOutput, # noqa: F401
861
+ DocumentQuestionAnsweringInput, # noqa: F401
862
+ DocumentQuestionAnsweringInputData, # noqa: F401
863
+ DocumentQuestionAnsweringOutputElement, # noqa: F401
864
+ DocumentQuestionAnsweringParameters, # noqa: F401
865
+ FeatureExtractionInput, # noqa: F401
866
+ FeatureExtractionInputTruncationDirection, # noqa: F401
867
+ FillMaskInput, # noqa: F401
868
+ FillMaskOutputElement, # noqa: F401
869
+ FillMaskParameters, # noqa: F401
870
+ ImageClassificationInput, # noqa: F401
871
+ ImageClassificationOutputElement, # noqa: F401
872
+ ImageClassificationOutputTransform, # noqa: F401
873
+ ImageClassificationParameters, # noqa: F401
874
+ ImageSegmentationInput, # noqa: F401
875
+ ImageSegmentationOutputElement, # noqa: F401
876
+ ImageSegmentationParameters, # noqa: F401
877
+ ImageSegmentationSubtask, # noqa: F401
878
+ ImageToImageInput, # noqa: F401
879
+ ImageToImageOutput, # noqa: F401
880
+ ImageToImageParameters, # noqa: F401
881
+ ImageToImageTargetSize, # noqa: F401
882
+ ImageToTextEarlyStoppingEnum, # noqa: F401
883
+ ImageToTextGenerationParameters, # noqa: F401
884
+ ImageToTextInput, # noqa: F401
885
+ ImageToTextOutput, # noqa: F401
886
+ ImageToTextParameters, # noqa: F401
887
+ ObjectDetectionBoundingBox, # noqa: F401
888
+ ObjectDetectionInput, # noqa: F401
889
+ ObjectDetectionOutputElement, # noqa: F401
890
+ ObjectDetectionParameters, # noqa: F401
891
+ Padding, # noqa: F401
892
+ QuestionAnsweringInput, # noqa: F401
893
+ QuestionAnsweringInputData, # noqa: F401
894
+ QuestionAnsweringOutputElement, # noqa: F401
895
+ QuestionAnsweringParameters, # noqa: F401
896
+ SentenceSimilarityInput, # noqa: F401
897
+ SentenceSimilarityInputData, # noqa: F401
898
+ SummarizationInput, # noqa: F401
899
+ SummarizationOutput, # noqa: F401
900
+ SummarizationParameters, # noqa: F401
901
+ SummarizationTruncationStrategy, # noqa: F401
902
+ TableQuestionAnsweringInput, # noqa: F401
903
+ TableQuestionAnsweringInputData, # noqa: F401
904
+ TableQuestionAnsweringOutputElement, # noqa: F401
905
+ TableQuestionAnsweringParameters, # noqa: F401
906
+ Text2TextGenerationInput, # noqa: F401
907
+ Text2TextGenerationOutput, # noqa: F401
908
+ Text2TextGenerationParameters, # noqa: F401
909
+ Text2TextGenerationTruncationStrategy, # noqa: F401
910
+ TextClassificationInput, # noqa: F401
911
+ TextClassificationOutputElement, # noqa: F401
912
+ TextClassificationOutputTransform, # noqa: F401
913
+ TextClassificationParameters, # noqa: F401
914
+ TextGenerationInput, # noqa: F401
915
+ TextGenerationInputGenerateParameters, # noqa: F401
916
+ TextGenerationInputGrammarType, # noqa: F401
917
+ TextGenerationOutput, # noqa: F401
918
+ TextGenerationOutputBestOfSequence, # noqa: F401
919
+ TextGenerationOutputDetails, # noqa: F401
920
+ TextGenerationOutputFinishReason, # noqa: F401
921
+ TextGenerationOutputPrefillToken, # noqa: F401
922
+ TextGenerationOutputToken, # noqa: F401
923
+ TextGenerationStreamOutput, # noqa: F401
924
+ TextGenerationStreamOutputStreamDetails, # noqa: F401
925
+ TextGenerationStreamOutputToken, # noqa: F401
926
+ TextToAudioEarlyStoppingEnum, # noqa: F401
927
+ TextToAudioGenerationParameters, # noqa: F401
928
+ TextToAudioInput, # noqa: F401
929
+ TextToAudioOutput, # noqa: F401
930
+ TextToAudioParameters, # noqa: F401
931
+ TextToImageInput, # noqa: F401
932
+ TextToImageOutput, # noqa: F401
933
+ TextToImageParameters, # noqa: F401
934
+ TextToImageTargetSize, # noqa: F401
935
+ TextToSpeechEarlyStoppingEnum, # noqa: F401
936
+ TextToSpeechGenerationParameters, # noqa: F401
937
+ TextToSpeechInput, # noqa: F401
938
+ TextToSpeechOutput, # noqa: F401
939
+ TextToSpeechParameters, # noqa: F401
940
+ TokenClassificationAggregationStrategy, # noqa: F401
941
+ TokenClassificationInput, # noqa: F401
942
+ TokenClassificationOutputElement, # noqa: F401
943
+ TokenClassificationParameters, # noqa: F401
944
+ TranslationInput, # noqa: F401
945
+ TranslationOutput, # noqa: F401
946
+ TranslationParameters, # noqa: F401
947
+ TranslationTruncationStrategy, # noqa: F401
948
+ TypeEnum, # noqa: F401
949
+ VideoClassificationInput, # noqa: F401
950
+ VideoClassificationOutputElement, # noqa: F401
951
+ VideoClassificationOutputTransform, # noqa: F401
952
+ VideoClassificationParameters, # noqa: F401
953
+ VisualQuestionAnsweringInput, # noqa: F401
954
+ VisualQuestionAnsweringInputData, # noqa: F401
955
+ VisualQuestionAnsweringOutputElement, # noqa: F401
956
+ VisualQuestionAnsweringParameters, # noqa: F401
957
+ ZeroShotClassificationInput, # noqa: F401
958
+ ZeroShotClassificationOutputElement, # noqa: F401
959
+ ZeroShotClassificationParameters, # noqa: F401
960
+ ZeroShotImageClassificationInput, # noqa: F401
961
+ ZeroShotImageClassificationOutputElement, # noqa: F401
962
+ ZeroShotImageClassificationParameters, # noqa: F401
963
+ ZeroShotObjectDetectionBoundingBox, # noqa: F401
964
+ ZeroShotObjectDetectionInput, # noqa: F401
965
+ ZeroShotObjectDetectionOutputElement, # noqa: F401
966
+ ZeroShotObjectDetectionParameters, # noqa: F401
967
+ )
968
+ from .inference_api import InferenceApi # noqa: F401
969
+ from .keras_mixin import (
970
+ KerasModelHubMixin, # noqa: F401
971
+ from_pretrained_keras, # noqa: F401
972
+ push_to_hub_keras, # noqa: F401
973
+ save_pretrained_keras, # noqa: F401
974
+ )
975
+ from .repocard import (
976
+ DatasetCard, # noqa: F401
977
+ ModelCard, # noqa: F401
978
+ RepoCard, # noqa: F401
979
+ SpaceCard, # noqa: F401
980
+ metadata_eval_result, # noqa: F401
981
+ metadata_load, # noqa: F401
982
+ metadata_save, # noqa: F401
983
+ metadata_update, # noqa: F401
984
+ )
985
+ from .repocard_data import (
986
+ CardData, # noqa: F401
987
+ DatasetCardData, # noqa: F401
988
+ EvalResult, # noqa: F401
989
+ ModelCardData, # noqa: F401
990
+ SpaceCardData, # noqa: F401
991
+ )
992
+ from .repository import Repository # noqa: F401
993
+ from .serialization import (
994
+ StateDictSplit, # noqa: F401
995
+ get_tf_storage_size, # noqa: F401
996
+ get_torch_storage_id, # noqa: F401
997
+ get_torch_storage_size, # noqa: F401
998
+ load_state_dict_from_file, # noqa: F401
999
+ load_torch_model, # noqa: F401
1000
+ save_torch_model, # noqa: F401
1001
+ save_torch_state_dict, # noqa: F401
1002
+ split_state_dict_into_shards_factory, # noqa: F401
1003
+ split_tf_state_dict_into_shards, # noqa: F401
1004
+ split_torch_state_dict_into_shards, # noqa: F401
1005
+ )
1006
+ from .serialization._dduf import (
1007
+ DDUFEntry, # noqa: F401
1008
+ export_entries_as_dduf, # noqa: F401
1009
+ export_folder_as_dduf, # noqa: F401
1010
+ read_dduf_file, # noqa: F401
1011
+ )
1012
+ from .utils import (
1013
+ CachedFileInfo, # noqa: F401
1014
+ CachedRepoInfo, # noqa: F401
1015
+ CachedRevisionInfo, # noqa: F401
1016
+ CacheNotFound, # noqa: F401
1017
+ CorruptedCacheException, # noqa: F401
1018
+ DeleteCacheStrategy, # noqa: F401
1019
+ HFCacheInfo, # noqa: F401
1020
+ HfFolder, # noqa: F401
1021
+ cached_assets_path, # noqa: F401
1022
+ configure_http_backend, # noqa: F401
1023
+ dump_environment_info, # noqa: F401
1024
+ get_session, # noqa: F401
1025
+ get_token, # noqa: F401
1026
+ logging, # noqa: F401
1027
+ scan_cache_dir, # noqa: F401
1028
+ )
meow/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import atexit
2
+ import logging
3
+ import os
4
+ import time
5
+ from concurrent.futures import Future
6
+ from dataclasses import dataclass
7
+ from io import SEEK_END, SEEK_SET, BytesIO
8
+ from pathlib import Path
9
+ from threading import Lock, Thread
10
+ from typing import Dict, List, Optional, Union
11
+
12
+ from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
13
+ from .utils import filter_repo_objects
14
+
15
+
16
# Module-level logger for background scheduled-commit activity.
logger = logging.getLogger(__name__)
17
+
18
+
19
@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    local_path: Path  # absolute path of the file on disk
    path_in_repo: str  # destination path inside the repo (path_in_repo prefix already applied)
    size_limit: int  # file size (bytes) at listing time; the upload is capped to this size
    last_modified: float  # st_mtime at listing time, used to skip unchanged files on the next run
27
+
28
+
29
class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Checkout the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performances on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and last commit have been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.")
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True)
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Keep track of already uploaded files
        self.last_uploaded: Dict[Path, float] = {}  # key is local path, value is timestamp

        # Scheduler
        if not every > 0:
            raise ValueError(f"'every' must be a positive integer, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        # NOTE: the stop flag must be initialized *before* the background thread starts.
        # The thread immediately calls `trigger()` -> `_push_to_hub`, which reads
        # `self.__stopped`; setting the flag after `start()` was a race that could
        # raise AttributeError on a fast first run.
        self.__stopped = False

        logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        # Best-effort final push when the interpreter exits.
        atexit.register(self._push_to_hub)

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for tests purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional["CommitInfo"]:
        """Internal wrapper around `push_to_hub` with logging and optional history squashing."""
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
            return value
        except Exception as e:
            logger.error(f"Error while pushing to Hub: {e}")  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional["CommitInfo"]:
        """
        Push folder to the Hub and return the commit info.

        <Tip warning={true}>

        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        issues.

        </Tip>

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(self.folder_path.glob("**/*"))  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                # `.get()` returns None for never-uploaded files, which can never
                # equal a real mtime => single dict lookup covers both cases.
                if self.last_uploaded.get(local_path) != stat.st_mtime:
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping schedule commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` as `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # Cap the file to its current size, even if the user append data to it while a scheduled commit is happening
                path_or_fileobj=PartialFileIO(file_to_upload.local_path, size_limit=file_to_upload.size_limit),
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Successful commit: keep track of the latest "last_modified" for each file
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified
        return commit_info
281
+
282
+
283
class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        # Cap the limit to the file's *current* size: the file may keep growing
        # afterwards, but bytes appended later must not be part of this upload.
        self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)

    def __del__(self) -> None:
        # `_file` is missing if `__init__` failed before/at `open()` (e.g. bad path).
        # Guard the lookup so that garbage collection never raises AttributeError.
        file = getattr(self, "_file", None)
        if file is not None:
            file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"

    def __len__(self) -> int:
        # Length of the upload, i.e. the truncated size, not the real file size.
        return self._size_limit

    def __getattribute__(self, name: str):
        if name.startswith("_") or name in ("read", "tell", "seek"):  # only 3 public methods supported
            return super().__getattribute__(name)
        raise NotImplementedError(f"PartialFileIO does not support '{name}'.")

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        if __whence == SEEK_END:
            # SEEK_END => set from the truncated end, not the real end of file
            __offset = len(self) + __offset
            __whence = SEEK_SET

        pos = self._file.seek(__offset, __whence)
        if pos > self._size_limit:
            return self._file.seek(self._size_limit)
        return pos

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that it is capped to the size limit.
        """
        current = self._file.tell()
        if __size is None or __size < 0:
            # Read until file limit
            truncated_size = self._size_limit - current
        else:
            # Read until file limit or __size
            truncated_size = min(__size, self._size_limit - current)
        return self._file.read(truncated_size)
meow/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import TYPE_CHECKING, Dict, Optional, Union
6
+
7
+ from huggingface_hub.errors import InferenceEndpointError, InferenceEndpointTimeoutError
8
+
9
+ from .inference._client import InferenceClient
10
+ from .inference._generated._async_client import AsyncInferenceClient
11
+ from .utils import get_session, logging, parse_datetime
12
+
13
+
14
+ if TYPE_CHECKING:
15
+ from .hf_api import HfApi
16
+
17
+
18
# Module-level logger (huggingface_hub's logging wrapper, imported from .utils).
logger = logging.get_logger(__name__)
19
+
20
+
21
class InferenceEndpointStatus(str, Enum):
    """Lifecycle states of an Inference Endpoint.

    Values mirror the raw status strings returned by the Inference Endpoints API.
    """

    PENDING = "pending"
    INITIALIZING = "initializing"
    UPDATING = "updating"
    UPDATE_FAILED = "updateFailed"
    RUNNING = "running"
    PAUSED = "paused"
    FAILED = "failed"
    SCALED_TO_ZERO = "scaledToZero"
30
+
31
+
32
class InferenceEndpointType(str, Enum):
    """Network visibility type of an Inference Endpoint.

    Values mirror the raw strings used by the Inference Endpoints API.
    """

    # Canonical member: `InferenceEndpointType("public")` resolves to this one.
    PUBLIC = "public"
    # Historical misspelling kept as an alias of PUBLIC (duplicate enum values are
    # aliases) so existing code referencing `InferenceEndpointType.PUBlIC` keeps working.
    PUBlIC = "public"
    PROTECTED = "protected"
    PRIVATE = "private"
36
+
37
+
38
@dataclass
class InferenceEndpoint:
    """
    Contains information about a deployed Inference Endpoint.

    Args:
        name (`str`):
            The unique name of the Inference Endpoint.
        namespace (`str`):
            The namespace where the Inference Endpoint is located.
        repository (`str`):
            The name of the model repository deployed on this Inference Endpoint.
        status ([`InferenceEndpointStatus`]):
            The current status of the Inference Endpoint.
        url (`str`, *optional*):
            The URL of the Inference Endpoint, if available. Only a deployed Inference Endpoint will have a URL.
        framework (`str`):
            The machine learning framework used for the model.
        revision (`str`):
            The specific model revision deployed on the Inference Endpoint.
        task (`str`):
            The task associated with the deployed model.
        created_at (`datetime.datetime`):
            The timestamp when the Inference Endpoint was created.
        updated_at (`datetime.datetime`):
            The timestamp of the last update of the Inference Endpoint.
        type ([`InferenceEndpointType`]):
            The type of the Inference Endpoint (public, protected, private).
        raw (`Dict`):
            The raw dictionary data returned from the API.
        token (`str` or `bool`, *optional*):
            Authentication token for the Inference Endpoint, if set when requesting the API. Will default to the
            locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server.

    Example:
    ```python
    >>> from huggingface_hub import get_inference_endpoint
    >>> endpoint = get_inference_endpoint("my-text-to-image")
    >>> endpoint
    InferenceEndpoint(name='my-text-to-image', ...)

    # Get status
    >>> endpoint.status
    'running'
    >>> endpoint.url
    'https://my-text-to-image.region.vendor.endpoints.huggingface.cloud'

    # Run inference
    >>> endpoint.client.text_to_image(...)

    # Pause endpoint to save $$$
    >>> endpoint.pause()

    # ...
    # Resume and wait for deployment
    >>> endpoint.resume()
    >>> endpoint.wait()
    >>> endpoint.client.text_to_image(...)
    ```
    """

    # Fields shown in __repr__. All `init=False` fields are derived from `raw`
    # in `__post_init__` (see `_populate_from_raw`), not passed by the caller.
    name: str = field(init=False)
    namespace: str
    repository: str = field(init=False)
    status: InferenceEndpointStatus = field(init=False)
    url: Optional[str] = field(init=False)  # None until the endpoint is deployed

    # Other fields (hidden from __repr__)
    framework: str = field(repr=False, init=False)
    revision: str = field(repr=False, init=False)
    task: str = field(repr=False, init=False)
    created_at: datetime = field(repr=False, init=False)
    updated_at: datetime = field(repr=False, init=False)
    type: InferenceEndpointType = field(repr=False, init=False)

    # Raw dict from the API — single source of truth for the derived fields above
    raw: Dict = field(repr=False)

    # Internal fields (excluded from equality comparison)
    _token: Union[str, bool, None] = field(repr=False, compare=False)
    _api: "HfApi" = field(repr=False, compare=False)
120
+
121
    @classmethod
    def from_raw(
        cls, raw: Dict, namespace: str, token: Union[str, bool, None] = None, api: Optional["HfApi"] = None
    ) -> "InferenceEndpoint":
        """Initialize object from raw dictionary.

        Args:
            raw (`Dict`): raw endpoint payload as returned by the API.
            namespace (`str`): namespace the endpoint belongs to.
            token (`str` or `bool`, *optional*): auth token; defaults to the API client's token.
            api ([`HfApi`], *optional*): client to use; a default `HfApi()` is created if not given.
        """
        if api is None:
            # Imported lazily to avoid a circular import with hf_api.
            from .hf_api import HfApi

            api = HfApi()
        if token is None:
            token = api.token

        # All other fields are populated in __post_init__
        return cls(raw=raw, namespace=namespace, _token=token, _api=api)
135
+
136
    def __post_init__(self) -> None:
        """Populate the derived (`init=False`) fields from the raw dictionary."""
        self._populate_from_raw()
139
+
140
    @property
    def client(self) -> InferenceClient:
        """Returns a client to make predictions on this Inference Endpoint.

        Returns:
            [`InferenceClient`]: an inference client pointing to the deployed endpoint.

        Raises:
            [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
        """
        # `url` is only set once the endpoint is deployed (see `_populate_from_raw`).
        if self.url is None:
            raise InferenceEndpointError(
                "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
                "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
            )
        return InferenceClient(model=self.url, token=self._token)
156
+
157
    @property
    def async_client(self) -> AsyncInferenceClient:
        """Returns a client to make predictions on this Inference Endpoint.

        Returns:
            [`AsyncInferenceClient`]: an asyncio-compatible inference client pointing to the deployed endpoint.

        Raises:
            [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
        """
        # Same precondition as `client`: no URL means the endpoint is not deployed yet.
        if self.url is None:
            raise InferenceEndpointError(
                "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
                "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
            )
        return AsyncInferenceClient(model=self.url, token=self._token)
173
+
174
    def wait(self, timeout: Optional[int] = None, refresh_every: int = 5) -> "InferenceEndpoint":
        """Wait for the Inference Endpoint to be deployed.

        Information from the server will be fetched every `refresh_every` seconds. If the Inference Endpoint is not
        deployed after `timeout` seconds, a [`InferenceEndpointTimeoutError`] will be raised. The [`InferenceEndpoint`]
        will be mutated in place with the latest data.

        Args:
            timeout (`int`, *optional*):
                The maximum time to wait for the Inference Endpoint to be deployed, in seconds. If `None`, will wait
                indefinitely.
            refresh_every (`int`, *optional*):
                The time to wait between each fetch of the Inference Endpoint status, in seconds. Defaults to 5s.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.

        Raises:
            [`InferenceEndpointError`]
                If the Inference Endpoint ended up in a failed state.
            [`InferenceEndpointTimeoutError`]
                If the Inference Endpoint is not deployed after `timeout` seconds.
        """
        if timeout is not None and timeout < 0:
            raise ValueError("`timeout` cannot be negative.")
        if refresh_every <= 0:
            raise ValueError("`refresh_every` must be positive.")

        start = time.time()
        while True:
            if self.url is not None:
                # Means the URL is provisioned => check if the endpoint is reachable
                response = get_session().get(self.url, headers=self._api._build_hf_headers(token=self._token))
                if response.status_code == 200:
                    logger.info("Inference Endpoint is ready to be used.")
                    return self
            if self.status == InferenceEndpointStatus.FAILED:
                raise InferenceEndpointError(
                    f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
                )
            if timeout is not None:
                if time.time() - start > timeout:
                    raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
            logger.info(f"Inference Endpoint is not deployed yet ({self.status}). Waiting {refresh_every}s...")
            time.sleep(refresh_every)
            # Refresh `self.status` / `self.url` from the server before the next check.
            self.fetch()
220
+
221
    def fetch(self) -> "InferenceEndpoint":
        """Fetch latest information about the Inference Endpoint.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        obj = self._api.get_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
        # Replace the raw payload and re-derive all `init=False` fields from it.
        self.raw = obj.raw
        self._populate_from_raw()
        return self
231
+
232
    def update(
        self,
        *,
        # Compute update
        accelerator: Optional[str] = None,
        instance_size: Optional[str] = None,
        instance_type: Optional[str] = None,
        min_replica: Optional[int] = None,
        max_replica: Optional[int] = None,
        scale_to_zero_timeout: Optional[int] = None,
        # Model update
        repository: Optional[str] = None,
        framework: Optional[str] = None,
        revision: Optional[str] = None,
        task: Optional[str] = None,
        custom_image: Optional[Dict] = None,
        secrets: Optional[Dict[str, str]] = None,
    ) -> "InferenceEndpoint":
        """Update the Inference Endpoint.

        This method allows the update of either the compute configuration, the deployed model, or both. All arguments are
        optional but at least one must be provided.

        This is an alias for [`HfApi.update_inference_endpoint`]. The current object is mutated in place with the
        latest data from the server.

        Args:
            accelerator (`str`, *optional*):
                The hardware accelerator to be used for inference (e.g. `"cpu"`).
            instance_size (`str`, *optional*):
                The size or type of the instance to be used for hosting the model (e.g. `"x4"`).
            instance_type (`str`, *optional*):
                The cloud instance type where the Inference Endpoint will be deployed (e.g. `"intel-icl"`).
            min_replica (`int`, *optional*):
                The minimum number of replicas (instances) to keep running for the Inference Endpoint.
            max_replica (`int`, *optional*):
                The maximum number of replicas (instances) to scale to for the Inference Endpoint.
            scale_to_zero_timeout (`int`, *optional*):
                The duration in minutes before an inactive endpoint is scaled to zero.

            repository (`str`, *optional*):
                The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
            framework (`str`, *optional*):
                The machine learning framework used for the model (e.g. `"custom"`).
            revision (`str`, *optional*):
                The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
            task (`str`, *optional*):
                The task on which to deploy the model (e.g. `"text-classification"`).
            custom_image (`Dict`, *optional*):
                A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
                Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
            secrets (`Dict[str, str]`, *optional*):
                Secret values to inject in the container environment.
        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        # Make API call
        obj = self._api.update_inference_endpoint(
            name=self.name,
            namespace=self.namespace,
            accelerator=accelerator,
            instance_size=instance_size,
            instance_type=instance_type,
            min_replica=min_replica,
            max_replica=max_replica,
            scale_to_zero_timeout=scale_to_zero_timeout,
            repository=repository,
            framework=framework,
            revision=revision,
            task=task,
            custom_image=custom_image,
            secrets=secrets,
            token=self._token,  # type: ignore [arg-type]
        )

        # Mutate current object with the server's view of the updated endpoint
        self.raw = obj.raw
        self._populate_from_raw()
        return self
311
+
312
+ def pause(self) -> "InferenceEndpoint":
313
+ """Pause the Inference Endpoint.
314
+
315
+ A paused Inference Endpoint will not be charged. It can be resumed at any time using [`InferenceEndpoint.resume`].
316
+ This is different than scaling the Inference Endpoint to zero with [`InferenceEndpoint.scale_to_zero`], which
317
+ would be automatically restarted when a request is made to it.
318
+
319
+ This is an alias for [`HfApi.pause_inference_endpoint`]. The current object is mutated in place with the
320
+ latest data from the server.
321
+
322
+ Returns:
323
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
324
+ """
325
+ obj = self._api.pause_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token) # type: ignore [arg-type]
326
+ self.raw = obj.raw
327
+ self._populate_from_raw()
328
+ return self
329
+
330
def resume(self, running_ok: bool = True) -> "InferenceEndpoint":
    """Resume this Inference Endpoint.

    Alias for [`HfApi.resume_inference_endpoint`]. This object is refreshed in place with the
    latest data from the server.

    Args:
        running_ok (`bool`, *optional*):
            If `True` (the default), no error is raised when the Inference Endpoint is already
            running.

    Returns:
        [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
    """
    refreshed = self._api.resume_inference_endpoint(
        name=self.name, namespace=self.namespace, running_ok=running_ok, token=self._token
    )  # type: ignore [arg-type]
    self.raw = refreshed.raw
    self._populate_from_raw()
    return self
350
+
351
def scale_to_zero(self) -> "InferenceEndpoint":
    """Scale this Inference Endpoint down to zero replicas.

    An endpoint scaled to zero is not billed. It is restarted automatically (with a cold-start
    delay) on the next request. This differs from [`InferenceEndpoint.pause`], which requires a
    manual [`InferenceEndpoint.resume`].

    Alias for [`HfApi.scale_to_zero_inference_endpoint`]. This object is refreshed in place with
    the latest data from the server.

    Returns:
        [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
    """
    refreshed = self._api.scale_to_zero_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
    self.raw = refreshed.raw
    self._populate_from_raw()
    return self
368
+
369
def delete(self) -> None:
    """Delete this Inference Endpoint.

    This operation is not reversible. To stop being charged without destroying the endpoint,
    prefer [`InferenceEndpoint.pause`] or [`InferenceEndpoint.scale_to_zero`].

    Alias for [`HfApi.delete_inference_endpoint`].
    """
    self._api.delete_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
378
+
379
def _populate_from_raw(self) -> None:
    """Refresh the instance attributes from the `raw` server payload.

    Called in __post_init__ and every time the Inference Endpoint is updated.
    """
    raw = self.raw
    # Fields shown in the repr
    self.name = raw["name"]
    model = raw["model"]
    status = raw["status"]
    self.repository = model["repository"]
    self.status = status["state"]
    self.url = status.get("url")

    # Remaining fields
    self.framework = model["framework"]
    self.revision = model["revision"]
    self.task = model["task"]
    self.created_at = parse_datetime(status["createdAt"])
    self.updated_at = parse_datetime(status["updatedAt"])
    self.type = raw["type"]
meow/lib/python3.13/site-packages/huggingface_hub/_local_folder.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024-present, the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Contains utilities to handle the `../.cache/huggingface` folder in local directories.
16
+
17
+ First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
18
+ download metadata when downloading files from the hub to a local directory (without
19
+ using the cache).
20
+
21
+ ./.cache/huggingface folder structure:
22
+ [4.0K] data
23
+ ├── [4.0K] .cache
24
+ │ └── [4.0K] huggingface
25
+ │ └── [4.0K] download
26
+ │ ├── [ 16] file.parquet.metadata
27
+ │ ├── [ 16] file.txt.metadata
28
+ │ └── [4.0K] folder
29
+ │ └── [ 16] file.parquet.metadata
30
+
31
+ ├── [6.5G] file.parquet
32
+ ├── [1.5K] file.txt
33
+ └── [4.0K] folder
34
+ └── [ 16] file.parquet
35
+
36
+
37
+ Download metadata file structure:
38
+ ```
39
+ # file.txt.metadata
40
+ 11c5a3d5811f50298f278a704980280950aedb10
41
+ a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
42
+ 1712656091.123
43
+
44
+ # file.parquet.metadata
45
+ 11c5a3d5811f50298f278a704980280950aedb10
46
+ 7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
47
+ 1712656091.123
48
+
49
+ ```
50
+ """
51
+
52
import logging
import os
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Optional

from .utils import WeakFileLock
60
+
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
@dataclass
class LocalDownloadFilePaths:
    """
    Bundle of paths used while downloading one file into a local dir.

    Returned by [`get_local_download_paths`].

    Attributes:
        file_path (`Path`):
            Final destination of the downloaded file.
        lock_path (`Path`):
            Lock file guaranteeing atomic reads/writes of the metadata.
        metadata_path (`Path`):
            File holding the download metadata.
    """

    file_path: Path
    lock_path: Path
    metadata_path: Path

    def incomplete_path(self, etag: str) -> Path:
        """Path where the file is temporarily written before being moved to `file_path`."""
        # One incomplete file per etag => concurrent downloads of different revisions don't clash.
        return self.metadata_path.with_suffix(f".{etag}.incomplete")
88
+
89
+
90
@dataclass(frozen=True)
class LocalUploadFilePaths:
    """
    Bundle of paths used while uploading one file from a local dir.

    Returned by [`get_local_upload_paths`]. Frozen: instances are immutable.

    Attributes:
        path_in_repo (`str`):
            Path of the file in the repo.
        file_path (`Path`):
            Location of the file on disk.
        lock_path (`Path`):
            Lock file guaranteeing atomic reads/writes of the metadata.
        metadata_path (`Path`):
            File holding the upload metadata.
    """

    path_in_repo: str
    file_path: Path
    lock_path: Path
    metadata_path: Path
112
+
113
+
114
@dataclass
class LocalDownloadFileMetadata:
    """
    Download-related metadata for one file stored in a local directory.

    Attributes:
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash the file was downloaded at.
        etag (`str`):
            ETag of the file in the repo, used to detect changes.
            For LFS files this is the sha256 of the content; for regular files, the git hash.
        timestamp (`float`):
            Unix timestamp at which this metadata was accurate (i.e. when it was saved).
    """

    filename: str
    commit_hash: str
    etag: str
    timestamp: float
135
+
136
+
137
+ @dataclass
138
+ class LocalUploadFileMetadata:
139
+ """
140
+ Metadata about a file in the local directory related to an upload process.
141
+ """
142
+
143
+ size: int
144
+
145
+ # Default values correspond to "we don't know yet"
146
+ timestamp: Optional[float] = None
147
+ should_ignore: Optional[bool] = None
148
+ sha256: Optional[str] = None
149
+ upload_mode: Optional[str] = None
150
+ is_uploaded: bool = False
151
+ is_committed: bool = False
152
+
153
+ def save(self, paths: LocalUploadFilePaths) -> None:
154
+ """Save the metadata to disk."""
155
+ with WeakFileLock(paths.lock_path):
156
+ with paths.metadata_path.open("w") as f:
157
+ new_timestamp = time.time()
158
+ f.write(str(new_timestamp) + "\n")
159
+
160
+ f.write(str(self.size)) # never None
161
+ f.write("\n")
162
+
163
+ if self.should_ignore is not None:
164
+ f.write(str(int(self.should_ignore)))
165
+ f.write("\n")
166
+
167
+ if self.sha256 is not None:
168
+ f.write(self.sha256)
169
+ f.write("\n")
170
+
171
+ if self.upload_mode is not None:
172
+ f.write(self.upload_mode)
173
+ f.write("\n")
174
+
175
+ f.write(str(int(self.is_uploaded)) + "\n")
176
+ f.write(str(int(self.is_committed)) + "\n")
177
+
178
+ self.timestamp = new_timestamp
179
+
180
+
181
def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
    """Compute paths to the files related to a download process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalDownloadFilePaths`]: the paths to use (file_path, lock_path, metadata_path,
        incomplete_path).
    """
    # `filename` is a repo path (always '/'-separated) => transcribe it for the local OS
    sanitized_filename = os.path.join(*filename.split("/"))
    if os.name == "nt" and (sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename):
        raise ValueError(
            f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
            " owner to rename this file."
        )
    file_path = local_dir / sanitized_filename
    metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
    lock_path = metadata_path.with_suffix(".lock")

    # Some Windows versions limit paths to 255 characters; longer paths must use the
    # extended-length "\\?\" prefix.
    if os.name == "nt" and not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
        file_path = Path("\\\\?\\" + os.path.abspath(file_path))
        lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
        metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))

    file_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
219
+
220
+
221
def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
    """Compute paths to the files related to an upload process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory that is uploaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalUploadFilePaths`]: the paths to use (file_path, lock_path, metadata_path).
    """
    # `filename` is a repo path (always '/'-separated) => transcribe it for the local OS
    sanitized_filename = os.path.join(*filename.split("/"))
    if os.name == "nt" and (sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename):
        raise ValueError(
            f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
            " owner to rename this file."
        )
    file_path = local_dir / sanitized_filename
    metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
    lock_path = metadata_path.with_suffix(".lock")

    # Some Windows versions limit paths to 255 characters; longer paths must use the
    # extended-length "\\?\" prefix.
    if os.name == "nt" and not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
        file_path = Path("\\\\?\\" + os.path.abspath(file_path))
        lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
        metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))

    file_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    return LocalUploadFilePaths(
        path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
    )
261
+
262
+
263
def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
    """Read metadata about a file in the local directory related to a download process.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists and is up-to-date,
        `None` otherwise (no metadata, corrupted metadata, or file modified since it was saved).
    """
    paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        if paths.metadata_path.exists():
            metadata: Optional[LocalDownloadFileMetadata] = None
            try:
                with paths.metadata_path.open() as f:
                    commit_hash = f.readline().strip()
                    etag = f.readline().strip()
                    timestamp = float(f.readline().strip())
                    metadata = LocalDownloadFileMetadata(
                        filename=filename,
                        commit_hash=commit_hash,
                        etag=etag,
                        timestamp=timestamp,
                    )
            except Exception as e:
                # remove the metadata file if it is corrupted / not the right format
                logger.warning(
                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
                )
                try:
                    paths.metadata_path.unlink()
                except Exception as e:
                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")

            # Guard: if parsing failed above, `metadata` is None => fall through and return None
            # (previously this path raised a NameError).
            if metadata is not None:
                try:
                    # check if the file exists and hasn't been modified since the metadata was saved
                    stat = paths.file_path.stat()
                    if (
                        stat.st_mtime - 1 <= metadata.timestamp
                    ):  # allow 1s difference as stat.st_mtime might not be precise
                        return metadata
                    logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
                except FileNotFoundError:
                    # file does not exist => metadata is outdated
                    return None
    return None
311
+
312
+
313
def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
    """Read metadata about a file in the local directory related to an upload process.

    TODO: factorize logic with `read_download_metadata`.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalUploadFileMetadata`]: the stored metadata if it exists and is up-to-date,
        otherwise a fresh metadata object holding only the file size.
    """
    paths = get_local_upload_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        if paths.metadata_path.exists():
            metadata: Optional[LocalUploadFileMetadata] = None
            try:
                with paths.metadata_path.open() as f:
                    timestamp = float(f.readline().strip())

                    size = int(f.readline().strip())  # never None

                    _should_ignore = f.readline().strip()
                    should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))

                    _sha256 = f.readline().strip()
                    sha256 = None if _sha256 == "" else _sha256

                    _upload_mode = f.readline().strip()
                    upload_mode = None if _upload_mode == "" else _upload_mode
                    if upload_mode not in (None, "regular", "lfs"):
                        raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")

                    is_uploaded = bool(int(f.readline().strip()))
                    is_committed = bool(int(f.readline().strip()))

                    metadata = LocalUploadFileMetadata(
                        timestamp=timestamp,
                        size=size,
                        should_ignore=should_ignore,
                        sha256=sha256,
                        upload_mode=upload_mode,
                        is_uploaded=is_uploaded,
                        is_committed=is_committed,
                    )
            except Exception as e:
                # remove the metadata file if it is corrupted / not the right format
                logger.warning(
                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
                )
                try:
                    paths.metadata_path.unlink()
                except Exception as e:
                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")

            # Guard: if parsing failed above, `metadata` is None => fall through to the fresh
            # metadata below (previously this path raised a NameError).
            if metadata is not None:
                # TODO: can we do better?
                if (
                    metadata.timestamp is not None
                    and metadata.is_uploaded  # file was uploaded
                    and not metadata.is_committed  # but not committed
                    and time.time() - metadata.timestamp > 20 * 3600  # and it's been more than 20 hours
                ):  # => we consider it as garbage-collected by S3
                    metadata.is_uploaded = False

                # check if the file exists and hasn't been modified since the metadata was saved
                try:
                    if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
                        return metadata
                    logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
                except FileNotFoundError:
                    # file does not exist => metadata is outdated
                    pass

    # empty metadata => we don't know anything except its size
    return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
389
+
390
+
391
def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
    """Write metadata about a file in the local directory related to a download process.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash the file was downloaded at.
        etag (`str`):
            ETag of the file in the repo.
    """
    download_paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(download_paths.lock_path):
        # Format: commit hash, etag and current timestamp, one per line.
        with download_paths.metadata_path.open("w") as f:
            f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
402
+
403
+
404
@lru_cache
def _huggingface_dir(local_dir: Path) -> Path:
    """Return the path to the `.cache/huggingface` directory in a local directory.

    The directory (and a `.gitignore` ignoring its whole content) is created on first call.

    Args:
        local_dir (`Path`):
            Path to the local directory.

    Return:
        `Path`: the `.cache/huggingface` directory (guaranteed to exist).
    """
    # Wrapped in lru_cache to avoid overwriting the .gitignore file if called multiple times
    # (the decorator was missing even though this comment announced it).
    path = local_dir / ".cache" / "huggingface"
    path.mkdir(exist_ok=True, parents=True)

    # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
    # Should be thread-safe enough like this.
    gitignore = path / ".gitignore"
    gitignore_lock = path / ".gitignore.lock"
    if not gitignore.exists():
        try:
            with WeakFileLock(gitignore_lock):
                gitignore.write_text("*")
            gitignore_lock.unlink()
        except OSError:  # FileNotFoundError, PermissionError, etc.
            # best-effort: a missing .gitignore is harmless
            pass
    return path
meow/lib/python3.13/site-packages/huggingface_hub/_login.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains methods to log in to the Hub."""
15
+
16
+ import os
17
+ import subprocess
18
+ from getpass import getpass
19
+ from pathlib import Path
20
+ from typing import Optional
21
+
22
+ from . import constants
23
+ from .commands._cli_utils import ANSI
24
+ from .utils import (
25
+ capture_output,
26
+ get_token,
27
+ is_google_colab,
28
+ is_notebook,
29
+ list_credential_helpers,
30
+ logging,
31
+ run_subprocess,
32
+ set_git_credential,
33
+ unset_git_credential,
34
+ )
35
+ from .utils._auth import (
36
+ _get_token_by_name,
37
+ _get_token_from_environment,
38
+ _get_token_from_file,
39
+ _get_token_from_google_colab,
40
+ _save_stored_tokens,
41
+ _save_token,
42
+ get_stored_tokens,
43
+ )
44
+ from .utils._deprecation import _deprecate_arguments, _deprecate_positional_args
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ _HF_LOGO_ASCII = """
50
+ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
51
+ _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
52
+ _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
53
+ _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
54
+ _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
55
+ """
56
+
57
+
58
@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def login(
    token: Optional[str] = None,
    *,
    add_to_git_credential: bool = False,
    new_session: bool = True,
    write_permission: bool = False,
) -> None:
    """Login the machine to access the Hub.

    The `token` is persisted in cache and set as a git credential. Once done, the machine
    is logged in and the access token will be available across all `huggingface_hub`
    components. If `token` is not provided, it will be prompted to the user either with
    a widget (in a notebook) or via the terminal.

    To log in from outside of a script, one can also use `huggingface-cli login` which is
    a cli command that wraps [`login`].

    <Tip>

    [`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and
    extends its capabilities.

    </Tip>

    <Tip>

    When the token is not passed, [`login`] will automatically detect if the script runs
    in a notebook or not. However, this detection might not be accurate due to the
    variety of notebooks that exists nowadays. If that is the case, you can always force
    the UI by using [`notebook_login`] or [`interpreter_login`].

    </Tip>

    Args:
        token (`str`, *optional*):
            User access token to generate from https://huggingface.co/settings/token.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user. If `token` is `None`,
            the value of `add_to_git_credential` is ignored and will be prompted again
            to the end user.
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If an organization token is passed. Only personal account tokens are valid
            to log in.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If token is invalid.
        [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
            If running in a notebook but `ipywidgets` is not installed.
    """
    # No token provided => prompt the user (widget in a notebook, terminal otherwise).
    if token is None:
        if is_notebook():
            notebook_login(new_session=new_session)
        else:
            interpreter_login(new_session=new_session)
        return

    if not add_to_git_credential:
        logger.info(
            "The token has not been saved to the git credentials helper. Pass "
            "`add_to_git_credential=True` in this function directly or "
            "`--add-to-git-credential` if using via `huggingface-cli` if "
            "you want to set the git credential as well."
        )
    _login(token, add_to_git_credential=add_to_git_credential)
131
+
132
+
133
def logout(token_name: Optional[str] = None) -> None:
    """Logout the machine from the Hub.

    Token is deleted from the machine and removed from git credential.

    Args:
        token_name (`str`, *optional*):
            Name of the access token to logout from. If `None`, will logout from all saved access tokens.
    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    # Nothing to do if neither an active token nor stored tokens exist.
    if get_token() is None and not get_stored_tokens():
        logger.warning("Not logged in!")
        return

    if token_name:
        _logout_from_token(token_name)
        logger.info(f"Successfully logged out from access token: {token_name}.")
    else:
        # Remove both the active token file and the stored-tokens file.
        for token_file in (constants.HF_TOKEN_PATH, constants.HF_STORED_TOKENS_PATH):
            try:
                Path(token_file).unlink()
            except FileNotFoundError:
                pass
        logger.info("Successfully logged out from all access tokens.")

    unset_git_credential()

    # A token may still be injected by the environment => fail loudly so the user knows.
    if _get_token_from_google_colab() is not None:
        raise EnvironmentError(
            "You are automatically logged in using a Google Colab secret.\n"
            "To log out, you must unset the `HF_TOKEN` secret in your Colab settings."
        )
    if _get_token_from_environment() is not None:
        raise EnvironmentError(
            "Token has been deleted from your machine but you are still logged in.\n"
            "To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables."
        )
173
+
174
+
175
def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None:
    """Switch to a different access token.

    Args:
        token_name (`str`):
            Name of the access token to switch to.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user. If `token` is `None`,
            the value of `add_to_git_credential` is ignored and will be prompted again
            to the end user.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")

    # Make `token_name` the active token (written to HF_TOKEN_PATH).
    _set_active_token(token_name, add_to_git_credential)
    logger.info(f"The current active token is: {token_name}")

    env_token = _get_token_from_environment()
    if env_token is not None and env_token != token:
        logger.warning(
            "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to."
        )
202
+
203
+
204
def auth_list() -> None:
    """List all stored access tokens."""
    tokens = get_stored_tokens()
    if not tokens:
        logger.info("No access tokens found.")
        return

    # Figure out which stored token (if any) matches the active one.
    current_token = get_token()
    current_token_name = None
    for name, value in tokens.items():
        if value == current_token:
            current_token_name = name

    # Header: column width is driven by the longest token name.
    max_offset = max(len("token"), max(len(name) for name in tokens)) + 2
    print(f" {{:<{max_offset}}}| {{:<15}}".format("name", "token"))
    print("-" * (max_offset + 2) + "|" + "-" * 15)

    # One row per stored token, masking the token value and marking the active one with '*'.
    for name in tokens:
        value = tokens.get(name, "<not set>")
        masked = f"{value[:3]}****{value[-4:]}" if value != "<not set>" else value
        marker = "*" if value == current_token else " "
        print(f"{marker} {{:<{max_offset}}}| {{:<15}}".format(name, masked))

    if _get_token_from_environment():
        logger.warning(
            "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above."
        )
    elif current_token_name is None:
        logger.warning(
            "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `huggingface-cli login` to log in."
        )
238
+
239
+
240
+ ###
241
+ # Interpreter-based login (text)
242
+ ###
243
+
244
+
245
@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def interpreter_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a prompt to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when not run in a notebook.
    [`interpreter_login`] is useful if you want to force the use of the terminal prompt
    instead of a notebook widget.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    """
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    # Local import to avoid a circular dependency with the commands package.
    from .commands.delete_cache import _ask_for_confirmation_no_tui

    print(_HF_LOGO_ASCII)
    if get_token() is not None:
        logger.info(
            " A token is already saved on your machine. Run `huggingface-cli"
            " whoami` to get more information or `huggingface-cli logout` if you want"
            " to log out."
        )
        logger.info(" Setting a new token will erase the existing one.")

    logger.info(
        " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ."
    )
    if os.name == "nt":
        logger.info("Token can be pasted using 'Right-Click'.")
    user_token = getpass("Enter your token (input will not be visible): ")
    save_as_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?")

    _login(token=user_token, add_to_git_credential=save_as_git_credential)
291
+
292
+
293
+ ###
294
+ # Notebook-based login (widget)
295
+ ###
296
+
297
# HTML snippet for the (password-based) notebook login widget. The warning about
# plain-text storage exists because widget values can be persisted in the notebook file.
NOTEBOOK_LOGIN_PASSWORD_HTML = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Immediately click login after typing your password or
it might be stored in plain text in this notebook file. </center>"""


# HTML shown above the token input in `notebook_login` (header of the widget).
NOTEBOOK_LOGIN_TOKEN_HTML_START = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Copy a token from <a
href="https://huggingface.co/settings/tokens" target="_blank">your Hugging Face
tokens page</a> and paste it below. <br> Immediately click login after copying
your token or it might be stored in plain text in this notebook file. </center>"""


# HTML shown below the login button in `notebook_login` (footer of the widget).
NOTEBOOK_LOGIN_TOKEN_HTML_END = """
<b>Pro Tip:</b> If you don't already have one, you can create a dedicated
'notebooks' token with 'write' access, that you can then easily reuse for all
notebooks. </center>"""
315
+
316
+
317
# Deprecation shims: `write_permission` is ignored and scheduled for removal in v1.0,
# and positional arguments are disallowed going forward.
@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def notebook_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a widget to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when run in a notebook.
    [`notebook_login`] is useful if you want to force the use of the notebook widget
    instead of a prompt in the terminal.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.

    Raises:
        ImportError: If `ipywidgets`/`IPython` are not installed (i.e. not in a notebook).
    """
    # `ipywidgets` is an optional dependency only available in notebook environments.
    try:
        import ipywidgets.widgets as widgets  # type: ignore
        from IPython.display import display  # type: ignore
    except ImportError:
        raise ImportError(
            "The `notebook_login` function can only be used in a notebook (Jupyter or"
            " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`."
        )
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%")

    # `Password` masks the token input in the rendered notebook.
    token_widget = widgets.Password(description="Token:")
    git_checkbox_widget = widgets.Checkbox(value=True, description="Add token as git credential?")
    token_finish_button = widgets.Button(description="Login")

    login_token_widget = widgets.VBox(
        [
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_START),
            token_widget,
            git_checkbox_widget,
            token_finish_button,
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_END),
        ],
        layout=box_layout,
    )
    display(login_token_widget)

    # On click events
    def login_token_event(t):
        """Event handler for the login button."""
        token = token_widget.value
        add_to_git_credential = git_checkbox_widget.value
        # Erase token and clear value to make sure it's not saved in the notebook.
        token_widget.value = ""
        # Hide inputs
        login_token_widget.children = [widgets.Label("Connecting...")]
        try:
            # `capture_output` collects what `_login` prints/logs so it can be
            # re-rendered inside the widget instead of the cell output.
            with capture_output() as captured:
                _login(token, add_to_git_credential=add_to_git_credential)
            message = captured.getvalue()
        except Exception as error:
            message = str(error)
        # Print result (success message or error)
        login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()]

    token_finish_button.on_click(login_token_event)
388
+
389
+
390
+ ###
391
+ # Login private helpers
392
+ ###
393
+
394
+
395
def _login(
    token: str,
    add_to_git_credential: bool,
) -> None:
    """Validate `token` against the Hub, store it locally and set it as the active token.

    Args:
        token (`str`):
            The access token to log in with. Organization tokens (`api_org*`) are rejected.
        add_to_git_credential (`bool`):
            If `True`, also store the token in the configured git credential helper.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the token is an organization token.
    """
    from .hf_api import whoami  # avoid circular import

    if token.startswith("api_org"):
        raise ValueError("You must use your personal account token, not an organization token.")

    # `whoami` both validates the token against the server and returns its metadata.
    token_info = whoami(token)
    permission = token_info["auth"]["accessToken"]["role"]
    logger.info(f"Token is valid (permission: {permission}).")

    token_name = token_info["auth"]["accessToken"]["displayName"]
    # Store token locally
    _save_token(token=token, token_name=token_name)
    # Set active token
    _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential)
    logger.info("Login successful.")
    if _get_token_from_environment():
        # Fix: the original message read "variable`HF_TOKEN`" (missing space).
        logger.warning(
            "Note: Environment variable `HF_TOKEN` is set and is the current active token independently from the token you've just configured."
        )
    else:
        logger.info(f"The current active token is: `{token_name}`")
420
+
421
+
422
def _logout_from_token(token_name: str) -> None:
    """Logout from a specific access token.

    Args:
        token_name (`str`):
            The name of the access token to logout from.

    Note:
        This is a silent no-op if no tokens are stored or if `token_name` is unknown —
        despite what an earlier docstring claimed, no `ValueError` is raised.
    """
    stored_tokens = get_stored_tokens()
    # If there is no access tokens saved or the access token name is not found, do nothing
    if not stored_tokens or token_name not in stored_tokens:
        return

    token = stored_tokens.pop(token_name)
    _save_stored_tokens(stored_tokens)

    # If the removed token was also the active one, delete the active-token file.
    if token == _get_token_from_file():
        logger.warning(f"Active token '{token_name}' has been deleted.")
        Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True)
443
+
444
+
445
def _set_active_token(
    token_name: str,
    add_to_git_credential: bool,
) -> None:
    """Make the token stored under `token_name` the active access token.

    Args:
        token_name (`str`):
            The name of the token to set as active.
        add_to_git_credential (`bool`):
            If `True`, also push the token to the configured git credential helper.

    Raises:
        ValueError: If no token is stored under `token_name`.
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")

    if add_to_git_credential:
        if not _is_git_credential_helper_configured():
            logger.warning("Token has not been saved to git credential helper.")
        else:
            set_git_credential(token)
            helper_names = ",".join(list_credential_helpers())
            logger.info(
                "Your token has been saved in your configured git credential helpers" + f" ({helper_names})."
            )

    # Persist the active token to HF_TOKEN_PATH, creating parent directories if needed.
    token_path = Path(constants.HF_TOKEN_PATH)
    token_path.parent.mkdir(parents=True, exist_ok=True)
    token_path.write_text(token)
    logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}")
472
+
473
+
474
def _is_git_credential_helper_configured() -> bool:
    """Return `True` if at least one git credential helper is usable.

    If none is configured, a warning is printed — except on Google Colab, where
    `huggingface_hub` silently sets the "store" helper globally instead.
    """
    if list_credential_helpers():
        # At least one helper already configured: nothing to do, nothing to warn.
        return True

    # Special case Google Colab: enable "store" on the fly rather than warning.
    # See https://github.com/huggingface/huggingface_hub/issues/1043#issuecomment-1247010710
    if is_google_colab():
        _set_store_as_git_credential_helper_globally()
        return True

    # No helper anywhere: tell the user how to configure one.
    warning_message = (
        "Cannot authenticate through git-credential as no helper is defined on your"
        " machine.\nYou might have to re-authenticate when pushing to the Hugging"
        " Face Hub.\nRun the following command in your terminal in case you want to"
        " set the 'store' credential helper as default.\n\ngit config --global"
        " credential.helper store\n\nRead"
        " https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more"
        " details."
    )
    print(ANSI.red(warning_message))
    return False
503
+
504
+
505
def _set_store_as_git_credential_helper_globally() -> None:
    """Set globally the credential.helper to `store`.

    To be used only in Google Colab as we assume the user doesn't care about the git
    credential config. It is the only particular case where we don't want to display the
    warning message in [`notebook_login()`].

    Related:
    - https://github.com/huggingface/huggingface_hub/issues/1043
    - https://github.com/huggingface/huggingface_hub/issues/1051
    - https://git-scm.com/docs/git-credential-store

    Raises:
        EnvironmentError: If the `git config` subprocess fails; carries its stderr.
    """
    try:
        run_subprocess("git config --global credential.helper store")
    except subprocess.CalledProcessError as exc:
        # Chain explicitly so the original CalledProcessError (command, return code)
        # is preserved as the cause instead of only the captured stderr text.
        raise EnvironmentError(exc.stderr) from exc
meow/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Dict, List, Literal, Optional, Union
4
+
5
+ import requests
6
+ from tqdm.auto import tqdm as base_tqdm
7
+ from tqdm.contrib.concurrent import thread_map
8
+
9
+ from . import constants
10
+ from .errors import GatedRepoError, LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
11
+ from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
12
+ from .hf_api import DatasetInfo, HfApi, ModelInfo, SpaceInfo
13
+ from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
14
+ from .utils import tqdm as hf_tqdm
15
+
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
@validate_hf_hub_args
def snapshot_download(
    repo_id: str,
    *,
    repo_type: Optional[str] = None,
    revision: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    local_dir: Union[str, Path, None] = None,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    user_agent: Optional[Union[Dict, str]] = None,
    proxies: Optional[Dict] = None,
    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
    force_download: bool = False,
    token: Optional[Union[bool, str]] = None,
    local_files_only: bool = False,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    max_workers: int = 8,
    tqdm_class: Optional[base_tqdm] = None,
    headers: Optional[Dict[str, str]] = None,
    endpoint: Optional[str] = None,
    # Deprecated args
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    resume_download: Optional[bool] = None,
) -> str:
    """Download repo files.

    Download a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from
    a repo, because you don't know which ones you will need a priori. All files are nested inside a folder in order
    to keep their actual filename relative to that folder. You can also filter which files to download using
    `allow_patterns` and `ignore_patterns`.

    If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
    option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
    to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
    cache-system, it's optimized for regularly pulling the latest version of a repository.

    An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly
    configured. It is also not possible to filter which files to download when cloning a repository using git.

    Args:
        repo_id (`str`):
            A user or an organization name and a repo name separated by a `/`.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
            `None` or `"model"` if downloading from a model. Default is `None`.
        revision (`str`, *optional*):
            An optional Git revision id which can be a branch name, a tag, or a
            commit hash.
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.
        local_dir (`str` or `Path`, *optional*):
            If provided, the downloaded files will be placed under this directory.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        user_agent (`str`, `dict`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        etag_timeout (`float`, *optional*, defaults to `10`):
            When fetching ETag, how many seconds to wait for the server to send
            data before giving up which is passed to `requests.request`.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether the file should be downloaded even if it already exists in the local cache.
        token (`str`, `bool`, *optional*):
            A token to be used for the download.
            - If `True`, the token is read from the HuggingFace config
              folder.
            - If a string, it's used as the authentication token.
        headers (`dict`, *optional*):
            Additional headers to include in the request. Those headers take precedence over the others.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, avoid downloading the file and return the path to the
            local cached file if it exists.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are downloaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not downloaded.
        max_workers (`int`, *optional*):
            Number of concurrent threads to download files (1 thread = 1 file download).
            Defaults to 8.
        tqdm_class (`tqdm`, *optional*):
            If provided, overwrites the default behavior for the progress bar. Passed
            argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
            Note that the `tqdm_class` is not passed to each individual download.
            Defaults to the custom HF progress bar that can be disabled by setting
            `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.

    Returns:
        `str`: folder path of the repo snapshot.

    Raises:
        [`~utils.RepositoryNotFoundError`]
            If the repository to download from cannot be found. This may be because it doesn't exist,
            or because it is set to `private` and you do not have access.
        [`~utils.RevisionNotFoundError`]
            If the revision to download from cannot be found.
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If `token=True` and the token cannot be found.
        [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
            ETag cannot be determined.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            if some parameter value is invalid.
    """
    # Normalize defaults and coerce Path arguments to str for os.path operations.
    if cache_dir is None:
        cache_dir = constants.HF_HUB_CACHE
    if revision is None:
        revision = constants.DEFAULT_REVISION
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if repo_type is None:
        repo_type = "model"
    if repo_type not in constants.REPO_TYPES:
        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")

    storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))

    repo_info: Union[ModelInfo, DatasetInfo, SpaceInfo, None] = None
    api_call_error: Optional[Exception] = None
    if not local_files_only:
        # try/except logic to handle different errors => taken from `hf_hub_download`
        try:
            # if we have internet connection we want to list files to download
            api = HfApi(
                library_name=library_name,
                library_version=library_version,
                user_agent=user_agent,
                endpoint=endpoint,
                headers=headers,
            )
            repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
            # Actually raise for those subclasses of ConnectionError
            raise
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            OfflineModeIsEnabled,
        ) as error:
            # Internet connection is down
            # => will try to use local files only
            api_call_error = error
            pass
        except RevisionNotFoundError:
            # The repo was found but the revision doesn't exist on the Hub (never existed or got deleted)
            raise
        except requests.HTTPError as error:
            # Multiple reasons for an http error:
            # - Repository is private and invalid/missing token sent
            # - Repository is gated and invalid/missing token sent
            # - Hub is down (error 500 or 504)
            # => let's switch to 'local_files_only=True' to check if the files are already cached.
            # (if it's not the case, the error will be re-raised)
            api_call_error = error
            pass

    # At this stage, if `repo_info` is None it means either:
    # - internet connection is down
    # - internet connection is deactivated (local_files_only=True or HF_HUB_OFFLINE=True)
    # - repo is private/gated and invalid/missing token sent
    # - Hub is down
    # => let's look if we can find the appropriate folder in the cache:
    # - if the specified revision is a commit hash, look inside "snapshots".
    # - if the specified revision is a branch or tag, look inside "refs".
    # => if local_dir is not None, we will return the path to the local folder if it exists.
    if repo_info is None:
        # Try to get which commit hash corresponds to the specified revision
        commit_hash = None
        if REGEX_COMMIT_HASH.match(revision):
            commit_hash = revision
        else:
            ref_path = os.path.join(storage_folder, "refs", revision)
            if os.path.exists(ref_path):
                # retrieve commit_hash from refs file
                # (refs files are written without a trailing newline — see the
                # f.write(commit_hash) below — so read() yields the bare hash)
                with open(ref_path) as f:
                    commit_hash = f.read()

        # Try to locate snapshot folder for this commit hash
        if commit_hash is not None:
            snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
            if os.path.exists(snapshot_folder):
                # Snapshot folder exists => let's return it
                # (but we can't check if all the files are actually there)
                return snapshot_folder
        # If local_dir is not None, return it if it exists and is not empty
        if local_dir is not None:
            local_dir = Path(local_dir)
            if local_dir.is_dir() and any(local_dir.iterdir()):
                logger.warning(
                    f"Returning existing local_dir `{local_dir}` as remote repo cannot be accessed in `snapshot_download` ({api_call_error})."
                )
                return str(local_dir.resolve())
        # If we couldn't find the appropriate folder on disk, raise an error.
        if local_files_only:
            raise LocalEntryNotFoundError(
                "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
                "outgoing traffic has been disabled. To enable repo look-ups and downloads online, pass "
                "'local_files_only=False' as input."
            )
        elif isinstance(api_call_error, OfflineModeIsEnabled):
            raise LocalEntryNotFoundError(
                "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
                "outgoing traffic has been disabled. To enable repo look-ups and downloads online, set "
                "'HF_HUB_OFFLINE=0' as environment variable."
            ) from api_call_error
        elif isinstance(api_call_error, RepositoryNotFoundError) or isinstance(api_call_error, GatedRepoError):
            # Repo not found => let's raise the actual error
            raise api_call_error
        else:
            # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
            raise LocalEntryNotFoundError(
                "An error happened while trying to locate the files on the Hub and we cannot find the appropriate"
                " snapshot folder for the specified revision on the local disk. Please check your internet connection"
                " and try again."
            ) from api_call_error

    # At this stage, internet connection is up and running
    # => let's download the files!
    assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."
    assert repo_info.siblings is not None, "Repo info returned from server must have a siblings list."
    filtered_repo_files = list(
        filter_repo_objects(
            items=[f.rfilename for f in repo_info.siblings],
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
        )
    )
    commit_hash = repo_info.sha
    snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
    # if passed revision is not identical to commit_hash
    # then revision has to be a branch name or tag name.
    # In that case store a ref.
    if revision != commit_hash:
        ref_path = os.path.join(storage_folder, "refs", revision)
        try:
            os.makedirs(os.path.dirname(ref_path), exist_ok=True)
            with open(ref_path, "w") as f:
                f.write(commit_hash)
        except OSError as e:
            # Best-effort: failing to record the ref must not abort the download.
            logger.warning(f"Ignored error while writing commit hash to {ref_path}: {e}.")

    # we pass the commit_hash to hf_hub_download
    # so no network call happens if we already
    # have the file locally.
    def _inner_hf_hub_download(repo_file: str):
        return hf_hub_download(
            repo_id,
            filename=repo_file,
            repo_type=repo_type,
            revision=commit_hash,
            endpoint=endpoint,
            cache_dir=cache_dir,
            local_dir=local_dir,
            local_dir_use_symlinks=local_dir_use_symlinks,
            library_name=library_name,
            library_version=library_version,
            user_agent=user_agent,
            proxies=proxies,
            etag_timeout=etag_timeout,
            resume_download=resume_download,
            force_download=force_download,
            token=token,
            headers=headers,
        )

    if constants.HF_HUB_ENABLE_HF_TRANSFER:
        # when using hf_transfer we don't want extra parallelism
        # from the one hf_transfer provides
        for file in filtered_repo_files:
            _inner_hf_hub_download(file)
    else:
        thread_map(
            _inner_hf_hub_download,
            filtered_repo_files,
            desc=f"Fetching {len(filtered_repo_files)} files",
            max_workers=max_workers,
            # User can use its own tqdm class or the default one from `huggingface_hub.utils`
            tqdm_class=tqdm_class or hf_tqdm,
        )

    if local_dir is not None:
        return str(os.path.realpath(local_dir))
    return snapshot_folder
meow/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024-present, the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import enum
16
+ import logging
17
+ import os
18
+ import queue
19
+ import shutil
20
+ import sys
21
+ import threading
22
+ import time
23
+ import traceback
24
+ from datetime import datetime
25
+ from pathlib import Path
26
+ from threading import Lock
27
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
28
+
29
+ from . import constants
30
+ from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
31
+ from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
32
+ from .constants import DEFAULT_REVISION, REPO_TYPES
33
+ from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
34
+ from .utils._cache_manager import _format_size
35
+ from .utils.sha import sha_fileobj
36
+
37
+
38
+ if TYPE_CHECKING:
39
+ from .hf_api import HfApi
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
# How long a worker sleeps when it has no task before re-checking the queues.
WAITING_TIME_IF_NO_TASKS = 10  # seconds
# Upper bounds on the number of files bundled into a single commit.
MAX_NB_REGULAR_FILES_PER_COMMIT = 75
MAX_NB_LFS_FILES_PER_COMMIT = 150
46
+
47
+
48
def upload_large_folder_internal(
    api: "HfApi",
    repo_id: str,
    folder_path: Union[str, Path],
    *,
    repo_type: str,  # Repo type is required!
    revision: Optional[str] = None,
    private: Optional[bool] = None,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    num_workers: Optional[int] = None,
    print_report: bool = True,
    print_report_every: int = 60,
):
    """Upload a large folder to the Hub in the most resilient way possible.

    See [`HfApi.upload_large_folder`] for the full documentation.

    Fix applied: progress messages previously logged via the root logger
    (`logging.info`) now go through the module logger (`logger.info`), consistent
    with every other log call in this function and avoiding the implicit root
    logger configuration that `logging.info` triggers.
    """
    # 1. Check args and setup
    if repo_type is None:
        raise ValueError(
            "For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`."
            " If you are using the CLI, pass it as `--repo-type=model`."
        )
    if repo_type not in REPO_TYPES:
        raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")
    if revision is None:
        revision = DEFAULT_REVISION

    folder_path = Path(folder_path).expanduser().resolve()
    if not folder_path.is_dir():
        raise ValueError(f"Provided path: '{folder_path}' is not a directory")

    if ignore_patterns is None:
        ignore_patterns = []
    elif isinstance(ignore_patterns, str):
        ignore_patterns = [ignore_patterns]
    ignore_patterns += DEFAULT_IGNORE_PATTERNS

    if num_workers is None:
        nb_cores = os.cpu_count() or 1
        num_workers = max(nb_cores - 2, 2)  # Use all but 2 cores, or at least 2 cores

    # 2. Create repo if missing
    repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
    logger.info(f"Repo created: {repo_url}")
    repo_id = repo_url.repo_id

    # 3. List files to upload
    filtered_paths_list = filter_repo_objects(
        (path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()),
        allow_patterns=allow_patterns,
        ignore_patterns=ignore_patterns,
    )
    paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
    logger.info(f"Found {len(paths_list)} candidate files to upload")

    # Read metadata for each file (resumes a previous interrupted upload if any)
    items = [
        (paths, read_upload_metadata(folder_path, paths.path_in_repo))
        for paths in tqdm(paths_list, desc="Recovering from metadata files")
    ]

    # 4. Start workers
    status = LargeUploadStatus(items)
    threads = [
        threading.Thread(
            target=_worker_job,
            kwargs={
                "status": status,
                "api": api,
                "repo_id": repo_id,
                "repo_type": repo_type,
                "revision": revision,
            },
        )
        for _ in range(num_workers)
    ]

    for thread in threads:
        thread.start()

    # 5. Print regular reports
    if print_report:
        print("\n\n" + status.current_report())
    last_report_ts = time.time()
    while True:
        time.sleep(1)
        if time.time() - last_report_ts >= print_report_every:
            if print_report:
                _print_overwrite(status.current_report())
            last_report_ts = time.time()
        if status.is_done():
            # Use the module logger, not the root logger (was `logging.info`).
            logger.info("Is done: exiting main loop")
            break

    for thread in threads:
        thread.join()

    logger.info(status.current_report())
    # Use the module logger, not the root logger (was `logging.info`).
    logger.info("Upload is complete!")
149
+
150
+
151
+ ####################
152
+ # Logic to manage workers and synchronize tasks
153
+ ####################
154
+
155
+
156
class WorkerJob(enum.Enum):
    """Types of task a worker thread can pick up (see `_determine_next_job`)."""

    SHA256 = enum.auto()  # compute a file's sha256 digest
    GET_UPLOAD_MODE = enum.auto()  # ask the Hub whether files are "lfs" or "regular"
    PREUPLOAD_LFS = enum.auto()  # upload an LFS file's content
    COMMIT = enum.auto()  # create a commit with files that are ready
    WAIT = enum.auto()  # if no tasks are available but we don't want to exit
162
+
163
+
164
# A work item: the local paths of a file paired with its (mutable) upload metadata.
JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
165
+
166
+
167
class LargeUploadStatus:
    """Contains information, queues and tasks for a large upload process.

    Holds one queue per pipeline stage (sha256 -> get upload mode -> LFS
    pre-upload -> commit), counters of how many workers are busy on each
    stage, and the full list of items so progress can be reported.
    All shared mutable state is guarded by `self.lock`.
    """

    def __init__(self, items: List[JOB_ITEM_T]):
        self.items = items
        self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.lock = Lock()

        # Number of workers currently busy on each stage (guarded by `self.lock`).
        self.nb_workers_sha256: int = 0
        self.nb_workers_get_upload_mode: int = 0
        self.nb_workers_preupload_lfs: int = 0
        self.nb_workers_commit: int = 0
        self.nb_workers_waiting: int = 0
        # Timestamp (time.time()) of the last commit attempt, or None if none yet.
        self.last_commit_attempt: Optional[float] = None

        self._started_at = datetime.now()

        # Setup queues: route each item to the first stage it still needs,
        # based on the metadata recovered from a previous (interrupted) run.
        for item in self.items:
            paths, metadata = item
            if metadata.sha256 is None:
                self.queue_sha256.put(item)
            elif metadata.upload_mode is None:
                self.queue_get_upload_mode.put(item)
            elif metadata.upload_mode == "lfs" and not metadata.is_uploaded:
                self.queue_preupload_lfs.put(item)
            elif not metadata.is_committed:
                self.queue_commit.put(item)
            else:
                logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)")

    def current_report(self) -> str:
        """Generate a report of the current status of the large upload."""
        # Aggregated counters over all items (ignored files are excluded from totals).
        nb_hashed = 0
        size_hashed = 0
        nb_preuploaded = 0
        nb_lfs = 0
        nb_lfs_unsure = 0  # files whose upload mode is not yet known
        size_preuploaded = 0
        nb_committed = 0
        size_committed = 0
        total_size = 0
        ignored_files = 0
        total_files = 0

        with self.lock:
            for _, metadata in self.items:
                if metadata.should_ignore:
                    ignored_files += 1
                    continue
                total_size += metadata.size
                total_files += 1
                if metadata.sha256 is not None:
                    nb_hashed += 1
                    size_hashed += metadata.size
                if metadata.upload_mode == "lfs":
                    nb_lfs += 1
                if metadata.upload_mode is None:
                    nb_lfs_unsure += 1
                if metadata.is_uploaded:
                    nb_preuploaded += 1
                    size_preuploaded += metadata.size
                if metadata.is_committed:
                    nb_committed += 1
                    size_committed += metadata.size
        total_size_str = _format_size(total_size)

        now = datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M:%S")
        elapsed = now - self._started_at
        elapsed_str = str(elapsed).split(".")[0]  # remove milliseconds

        # Header line with timestamp and elapsed time.
        message = "\n" + "-" * 10
        message += f" {now_str} ({elapsed_str}) "
        message += "-" * 10 + "\n"

        # Per-file progress line.
        message += "Files: "
        message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | "
        message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})"
        if nb_lfs_unsure > 0:
            message += f" (+{nb_lfs_unsure} unsure)"
        message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})"
        message += f" | ignored: {ignored_files}\n"

        # Per-worker activity line.
        message += "Workers: "
        message += f"hashing: {self.nb_workers_sha256} | "
        message += f"get upload mode: {self.nb_workers_get_upload_mode} | "
        message += f"pre-uploading: {self.nb_workers_preupload_lfs} | "
        message += f"committing: {self.nb_workers_commit} | "
        message += f"waiting: {self.nb_workers_waiting}\n"
        message += "-" * 51

        return message

    def is_done(self) -> bool:
        """Return True once every item is either committed or ignored."""
        with self.lock:
            return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items)
267
+
268
+
269
def _worker_job(
    status: LargeUploadStatus,
    api: "HfApi",
    repo_id: str,
    repo_type: str,
    revision: str,
):
    """
    Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded
    and committed. If no tasks are available, the worker will wait for 10 seconds before checking again.

    If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up.

    Read `upload_large_folder` docstring for more information on how tasks are prioritized.

    Args:
        status: shared state (queues, worker counters, lock) for the whole upload.
        api: client used for all Hub calls.
        repo_id / repo_type / revision: target repository coordinates.
    """
    while True:
        # Determine next task (None means everything is processed => exit).
        next_job: Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]] = _determine_next_job(status)
        if next_job is None:
            return
        job, items = next_job

        # Perform task. On failure the item(s) are re-queued so another worker
        # can retry; KeyboardInterrupt is always re-raised to stop the thread.
        if job == WorkerJob.SHA256:
            item = items[0]  # single item
            try:
                _compute_sha256(item)
                status.queue_get_upload_mode.put(item)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to compute sha256: {e}")
                # Fix: `traceback.format_exc()` returns a string; it was previously discarded.
                logger.error(traceback.format_exc())
                status.queue_sha256.put(item)

            with status.lock:
                status.nb_workers_sha256 -= 1

        elif job == WorkerJob.GET_UPLOAD_MODE:
            try:
                _get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to get upload mode: {e}")
                # Fix: log the traceback instead of discarding it.
                logger.error(traceback.format_exc())

            # Items are either:
            # - dropped (if should_ignore)
            # - put in LFS queue (if LFS)
            # - put in commit queue (if regular)
            # - or put back (if error occurred).
            for item in items:
                _, metadata = item
                if metadata.should_ignore:
                    continue
                if metadata.upload_mode == "lfs":
                    status.queue_preupload_lfs.put(item)
                elif metadata.upload_mode == "regular":
                    status.queue_commit.put(item)
                else:
                    status.queue_get_upload_mode.put(item)

            with status.lock:
                status.nb_workers_get_upload_mode -= 1

        elif job == WorkerJob.PREUPLOAD_LFS:
            item = items[0]  # single item
            try:
                _preupload_lfs(item, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
                status.queue_commit.put(item)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to preupload LFS: {e}")
                # Fix: log the traceback instead of discarding it.
                logger.error(traceback.format_exc())
                status.queue_preupload_lfs.put(item)

            with status.lock:
                status.nb_workers_preupload_lfs -= 1

        elif job == WorkerJob.COMMIT:
            try:
                _commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to commit: {e}")
                # Fix: log the traceback instead of discarding it.
                logger.error(traceback.format_exc())
                for item in items:
                    status.queue_commit.put(item)
            with status.lock:
                status.last_commit_attempt = time.time()
                status.nb_workers_commit -= 1

        elif job == WorkerJob.WAIT:
            time.sleep(WAITING_TIME_IF_NO_TASKS)
            with status.lock:
                status.nb_workers_waiting -= 1
370
+
371
+
372
def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]]:
    """Pick the next task for a worker, by priority.

    Returns a (job, items) tuple, or None when all files are processed and the
    worker should exit. Worker counters are incremented here, under the lock,
    so other workers see an up-to-date picture before the task starts.
    """
    with status.lock:
        # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
        if (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.last_commit_attempt is not None
            and time.time() - status.last_commit_attempt > 5 * 60
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
            return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))

        # 2. Commit if at least 150 files are ready to commit
        # NOTE(review): the debug message below says ">100" but the actual threshold is 150.
        elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
            status.nb_workers_commit += 1
            logger.debug("Job: commit (>100 files ready)")
            return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))

        # 3. Get upload mode if at least 10 files (fetched in batches of up to 50)
        elif status.queue_get_upload_mode.qsize() >= 10:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode (>10 files ready)")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))

        # 4. Preupload LFS file if at least 1 file and no worker is preuploading LFS
        elif status.queue_preupload_lfs.qsize() > 0 and status.nb_workers_preupload_lfs == 0:
            status.nb_workers_preupload_lfs += 1
            logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
            return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))

        # 5. Compute sha256 if at least 1 file and no worker is computing sha256
        elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
            status.nb_workers_sha256 += 1
            logger.debug("Job: sha256 (no other worker computing sha256)")
            return (WorkerJob.SHA256, _get_one(status.queue_sha256))

        # 6. Get upload mode if at least 1 file and no worker is getting upload mode
        elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode (no other worker getting upload mode)")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))

        # 7. Preupload LFS file if at least 1 file
        # Skip if hf_transfer is enabled and there is already a worker preuploading LFS
        elif status.queue_preupload_lfs.qsize() > 0 and (
            status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
        ):
            status.nb_workers_preupload_lfs += 1
            logger.debug("Job: preupload LFS")
            return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))

        # 8. Compute sha256 if at least 1 file
        elif status.queue_sha256.qsize() > 0:
            status.nb_workers_sha256 += 1
            logger.debug("Job: sha256")
            return (WorkerJob.SHA256, _get_one(status.queue_sha256))

        # 9. Get upload mode if at least 1 file
        elif status.queue_get_upload_mode.qsize() > 0:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))

        # 10. Commit if at least 1 file and 1 min since last commit attempt
        elif (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.last_commit_attempt is not None
            and time.time() - status.last_commit_attempt > 1 * 60
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit (1 min since last commit attempt)")
            return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))

        # 11. Commit if at least 1 file all other queues are empty and all workers are waiting
        # e.g. when it's the last commit
        elif (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.queue_sha256.qsize() == 0
            and status.queue_get_upload_mode.qsize() == 0
            and status.queue_preupload_lfs.qsize() == 0
            and status.nb_workers_sha256 == 0
            and status.nb_workers_get_upload_mode == 0
            and status.nb_workers_preupload_lfs == 0
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit")
            return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))

        # 12. If all queues are empty, exit
        elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
            logger.info("All files have been processed! Exiting worker.")
            return None

        # 13. If no task is available, wait
        else:
            status.nb_workers_waiting += 1
            logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
            return (WorkerJob.WAIT, [])
473
+
474
+
475
+ ####################
476
+ # Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit)
477
+ ####################
478
+
479
+
480
def _compute_sha256(item: JOB_ITEM_T) -> None:
    """Compute the sha256 digest of a file (if not already known) and persist it in metadata."""
    file_paths, file_meta = item
    # A digest may already be known from a previous (interrupted) run.
    if file_meta.sha256 is None:
        with file_paths.file_path.open("rb") as fileobj:
            file_meta.sha256 = sha_fileobj(fileobj).hex()
    file_meta.save(file_paths)
487
+
488
+
489
def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Resolve the upload mode ("lfs" or "regular") for each file and persist it in metadata.

    The server response also tells us whether a file should be ignored entirely.
    """
    operations = [_build_hacky_operation(entry) for entry in items]
    _fetch_upload_modes(
        additions=operations,
        repo_type=repo_type,
        repo_id=repo_id,
        headers=api._build_hf_headers(),
        revision=revision,
    )
    # Copy the server-resolved values back onto each item's metadata and save.
    for entry, operation in zip(items, operations):
        entry_paths, entry_meta = entry
        entry_meta.upload_mode = operation._upload_mode
        entry_meta.should_ignore = operation._should_ignore
        entry_meta.save(entry_paths)
507
+
508
+
509
def _preupload_lfs(item: JOB_ITEM_T, api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Upload a single LFS file's content to the Hub, then mark it as uploaded in metadata."""
    file_paths, file_meta = item
    operation = _build_hacky_operation(item)
    api.preupload_lfs_files(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        additions=[operation],
    )
    # Persist progress so an interrupted run does not re-upload this file.
    file_meta.is_uploaded = True
    file_meta.save(file_paths)
522
+
523
+
524
def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Create a single commit on the Hub containing all given files, then mark them committed."""
    operations = [_build_hacky_operation(entry) for entry in items]
    api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        operations=operations,
        commit_message="Add files using upload-large-folder tool",
    )
    # Persist progress for every file included in the commit.
    for entry_paths, entry_meta in items:
        entry_meta.is_committed = True
        entry_meta.save(entry_paths)
537
+
538
+
539
+ ####################
540
+ # Hacks with CommitOperationAdd to bypass checks/sha256 calculation
541
+ ####################
542
+
543
+
544
class HackyCommitOperationAdd(CommitOperationAdd):
    # Overrides `__post_init__` so the parent's initialization is not run —
    # presumably to bypass the checks/sha256 computation mentioned in the
    # section header above; `upload_info` is injected manually afterwards
    # (see `_build_hacky_operation`). TODO(review): confirm against
    # `CommitOperationAdd.__post_init__`.
    def __post_init__(self) -> None:
        # Normalize `Path` objects to plain strings.
        if isinstance(self.path_or_fileobj, Path):
            self.path_or_fileobj = str(self.path_or_fileobj)
548
+
549
+
550
def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
    """Build a `HackyCommitOperationAdd` whose upload info comes from precomputed metadata."""
    file_paths, file_meta = item
    op = HackyCommitOperationAdd(path_in_repo=file_paths.path_in_repo, path_or_fileobj=file_paths.file_path)
    # Read (at most) the first 512 bytes without consuming the stream position.
    with file_paths.file_path.open("rb") as fp:
        head_sample = fp.peek(512)[:512]
    if file_meta.sha256 is None:
        raise ValueError("sha256 must have been computed by now!")
    op.upload_info = UploadInfo(sha256=bytes.fromhex(file_meta.sha256), size=file_meta.size, sample=head_sample)
    return op
559
+
560
+
561
+ ####################
562
+ # Misc helpers
563
+ ####################
564
+
565
+
566
def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
    """Pop a single item from the queue, returned as a one-element list."""
    item = queue.get()
    return [item]
568
+
569
+
570
def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
    """Pop up to `n` items from the queue (as many as are currently available)."""
    count = min(queue.qsize(), n)
    return [queue.get() for _ in range(count)]
572
+
573
+
574
def _get_items_to_commit(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
    """Special case for commit job: the number of items to commit depends on the type of files.

    A single commit can hold at most `MAX_NB_LFS_FILES_PER_COMMIT` LFS files
    and/or `MAX_NB_REGULAR_FILES_PER_COMMIT` regular files.
    """
    selected: List[JOB_ITEM_T] = []
    lfs_count = 0
    regular_count = 0
    # Drain the queue until it is empty or a per-commit limit is hit.
    while queue.qsize() > 0:
        if lfs_count >= MAX_NB_LFS_FILES_PER_COMMIT or regular_count >= MAX_NB_REGULAR_FILES_PER_COMMIT:
            break
        entry = queue.get()
        selected.append(entry)
        _, entry_meta = entry
        if entry_meta.upload_mode == "lfs":
            lfs_count += 1
        else:
            regular_count += 1
    return selected
596
+
597
+
598
+ def _print_overwrite(report: str) -> None:
599
+ """Print a report, overwriting the previous lines.
600
+
601
+ Since tqdm in using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout`
602
+ to print the report.
603
+
604
+ Note: works well only if no other process is writing to `sys.stdout`!
605
+ """
606
+ report += "\n"
607
+ # Get terminal width
608
+ terminal_width = shutil.get_terminal_size().columns
609
+
610
+ # Count number of lines that should be cleared
611
+ nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines())
612
+
613
+ # Clear previous lines based on the number of lines in the report
614
+ for _ in range(nb_lines):
615
+ sys.stdout.write("\r\033[K") # Clear line
616
+ sys.stdout.write("\033[F") # Move cursor up one line
617
+
618
+ # Print the new report, filling remaining space with whitespace
619
+ sys.stdout.write(report)
620
+ sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1])))
621
+ sys.stdout.flush()
meow/lib/python3.13/site-packages/huggingface_hub/community.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data structures to interact with Discussions and Pull Requests on the Hub.
3
+
4
+ See [the Discussions and Pull Requests guide](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)
5
+ for more information on Pull Requests, Discussions, and the community tab.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from datetime import datetime
10
+ from typing import List, Literal, Optional, Union
11
+
12
+ from . import constants
13
+ from .utils import parse_datetime
14
+
15
+
16
# Allowed status values for a Discussion / Pull Request on the Hub.
DiscussionStatus = Literal["open", "closed", "merged", "draft"]


@dataclass
class Discussion:
    """
    A Discussion or Pull Request on the Hub.

    This dataclass is not intended to be instantiated directly.

    Attributes:
        title (`str`):
            The title of the Discussion / Pull Request
        status (`str`):
            The status of the Discussion / Pull Request.
            It must be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests )
                * `"draft"` (only for Pull Requests )
        num (`int`):
            The number of the Discussion / Pull Request.
        repo_id (`str`):
            The id (`"{namespace}/{repo_name}"`) of the repo on which
            the Discussion / Pull Request was open.
        repo_type (`str`):
            The type of the repo on which the Discussion / Pull Request was open.
            Possible values are: `"model"`, `"dataset"`, `"space"`.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        is_pull_request (`bool`):
            Whether or not this is a Pull Request.
        created_at (`datetime`):
            The `datetime` of creation of the Discussion / Pull Request.
        endpoint (`str`):
            Endpoint of the Hub. Default is https://huggingface.co.
        git_reference (`str`, *optional*):
            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
        url (`str`):
            (property) URL of the discussion on the Hub.
    """

    title: str
    status: DiscussionStatus
    num: int
    repo_id: str
    repo_type: str
    author: str
    is_pull_request: bool
    created_at: datetime
    endpoint: str

    @property
    def git_reference(self) -> Optional[str]:
        """
        If this is a Pull Request , returns the git reference to which changes can be pushed.
        Returns `None` otherwise.
        """
        return f"refs/pr/{self.num}" if self.is_pull_request else None

    @property
    def url(self) -> str:
        """Returns the URL of the discussion on the Hub."""
        # Model repos have no type prefix in their URL; other repo types do.
        if self.repo_type is not None and self.repo_type != constants.REPO_TYPE_MODEL:
            return f"{self.endpoint}/{self.repo_type}s/{self.repo_id}/discussions/{self.num}"
        return f"{self.endpoint}/{self.repo_id}/discussions/{self.num}"
85
+
86
+
87
@dataclass
class DiscussionWithDetails(Discussion):
    """
    Subclass of [`Discussion`].

    Attributes:
        title (`str`):
            The title of the Discussion / Pull Request
        status (`str`):
            The status of the Discussion / Pull Request.
            It can be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests )
                * `"draft"` (only for Pull Requests )
        num (`int`):
            The number of the Discussion / Pull Request.
        repo_id (`str`):
            The id (`"{namespace}/{repo_name}"`) of the repo on which
            the Discussion / Pull Request was open.
        repo_type (`str`):
            The type of the repo on which the Discussion / Pull Request was open.
            Possible values are: `"model"`, `"dataset"`, `"space"`.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        is_pull_request (`bool`):
            Whether or not this is a Pull Request.
        created_at (`datetime`):
            The `datetime` of creation of the Discussion / Pull Request.
        events (`list` of [`DiscussionEvent`]):
            The list of [`DiscussionEvents`] in this Discussion or Pull Request.
        conflicting_files (`Union[List[str], bool, None]`, *optional*):
            A list of conflicting files if this is a Pull Request.
            `None` if `self.is_pull_request` is `False`.
            `True` if there are conflicting files but the list can't be retrieved.
        target_branch (`str`, *optional*):
            The branch into which changes are to be merged if this is a
            Pull Request . `None` if `self.is_pull_request` is `False`.
        merge_commit_oid (`str`, *optional*):
            If this is a merged Pull Request , this is set to the OID / SHA of
            the merge commit, `None` otherwise.
        diff (`str`, *optional*):
            The git diff if this is a Pull Request , `None` otherwise.
        endpoint (`str`):
            Endpoint of the Hub. Default is https://huggingface.co.
        git_reference (`str`, *optional*):
            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
        url (`str`):
            (property) URL of the discussion on the Hub.
    """

    # Fields below extend `Discussion`; see the Attributes section above.
    events: List["DiscussionEvent"]
    conflicting_files: Union[List[str], bool, None]
    target_branch: Optional[str]
    merge_commit_oid: Optional[str]
    diff: Optional[str]
144
+
145
+
146
@dataclass
class DiscussionEvent:
    """
    An event in a Discussion or Pull Request.

    Use concrete classes:
        * [`DiscussionComment`]
        * [`DiscussionStatusChange`]
        * [`DiscussionCommit`]
        * [`DiscussionTitleChange`]

    Attributes:
        id (`str`):
            The ID of the event. An hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
    """

    id: str
    type: str
    created_at: datetime
    author: str

    _event: dict
    """Stores the original event data, in case we need to access it later."""
177
+
178
+
179
@dataclass
class DiscussionComment(DiscussionEvent):
    """A comment in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].


    Attributes:
        id (`str`):
            The ID of the event. An hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        content (`str`):
            The raw markdown content of the comment. Mentions, links and images are not rendered.
        edited (`bool`):
            Whether or not this comment has been edited.
        hidden (`bool`):
            Whether or not this comment has been hidden.
    """

    content: str
    edited: bool
    hidden: bool

    @property
    def rendered(self) -> str:
        """The rendered comment, as a HTML string"""
        return self._event["data"]["latest"]["html"]

    @property
    def last_edited_at(self) -> datetime:
        """The last edit time, as a `datetime` object."""
        return parse_datetime(self._event["data"]["latest"]["updatedAt"])

    @property
    def last_edited_by(self) -> str:
        """The username of the author of the last edit, or `"deleted"` if unknown."""
        return self._event["data"]["latest"].get("author", {}).get("name", "deleted")

    @property
    def edit_history(self) -> List[dict]:
        """The edit history of the comment"""
        return self._event["data"]["history"]

    @property
    def number_of_edits(self) -> int:
        """The number of entries in the edit history."""
        return len(self.edit_history)
232
+
233
+
234
@dataclass
class DiscussionStatusChange(DiscussionEvent):
    """A change of status in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. An hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        new_status (`str`):
            The status of the Discussion / Pull Request after the change.
            It can be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests )
    """

    # Status after the change; see the Attributes section above for values.
    new_status: str
260
+
261
+
262
@dataclass
class DiscussionCommit(DiscussionEvent):
    """A commit in a Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. An hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        summary (`str`):
            The summary of the commit.
        oid (`str`):
            The OID / SHA of the commit, as a hexadecimal string.
    """

    summary: str  # commit summary (subject line)
    oid: str  # commit SHA, hexadecimal
287
+
288
+
289
@dataclass
class DiscussionTitleChange(DiscussionEvent):
    """A rename event in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. An hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        old_title (`str`):
            The previous title for the Discussion / Pull Request.
        new_title (`str`):
            The new title.
    """

    old_title: str  # title before the rename
    new_title: str  # title after the rename
314
+
315
+
316
def deserialize_event(event: dict) -> DiscussionEvent:
    """Instantiates a [`DiscussionEvent`] from a dict"""
    event_type: str = event["type"]

    # Fields shared by every event subclass.
    common_args = dict(
        id=event["id"],
        type=event_type,
        created_at=parse_datetime(event["createdAt"]),
        author=event.get("author", {}).get("name", "deleted"),
        _event=event,
    )

    if event_type == "comment":
        data = event["data"]
        return DiscussionComment(
            **common_args,
            edited=data["edited"],
            hidden=data["hidden"],
            content=data["latest"]["raw"],
        )
    if event_type == "status-change":
        return DiscussionStatusChange(**common_args, new_status=event["data"]["status"])
    if event_type == "commit":
        data = event["data"]
        return DiscussionCommit(**common_args, summary=data["subject"], oid=data["oid"])
    if event_type == "title-change":
        data = event["data"]
        return DiscussionTitleChange(**common_args, old_title=data["from"], new_title=data["to"])

    # Unknown event type: fall back to the generic base class.
    return DiscussionEvent(**common_args)