poemsforaphrodite's picture
Upload folder using huggingface_hub
b72ab63 verified
raw
history blame
11.7 kB
import os
import sys
import uuid
import warnings
from ftplib import FTP, Error, error_perm
from typing import Any
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, isfilelike
class FTPFileSystem(AbstractFileSystem):
"""A filesystem over classic FTP"""
root_marker = "/"
cachable = False
protocol = "ftp"
def __init__(
self,
host,
port=21,
username=None,
password=None,
acct=None,
block_size=None,
tempdir=None,
timeout=30,
encoding="utf-8",
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable FTP url.
Authentication will be anonymous if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int
Port to connect with
username: str or None
If authenticating, the user's identifier
password: str of None
User's password on the server, if using
acct: str or None
Some servers also need an "account" string for auth
block_size: int or None
If given, the read-ahead or write buffer size.
tempdir: str
Directory on remote to put temporary files when in a transaction
timeout: int
Timeout of the ftp connection in seconds
encoding: str
Encoding to use for directories and filenames in FTP connection
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.tempdir = tempdir or "/tmp"
self.cred = username, password, acct
self.timeout = timeout
self.encoding = encoding
if block_size is not None:
self.blocksize = block_size
else:
self.blocksize = 2**16
self._connect()
def _connect(self):
if sys.version_info >= (3, 9):
self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
elif self.encoding:
warnings.warn("`encoding` not supported for python<3.9, ignoring")
self.ftp = FTP(timeout=self.timeout)
else:
self.ftp = FTP(timeout=self.timeout)
self.ftp.connect(self.host, self.port)
self.ftp.login(*self.cred)
@classmethod
def _strip_protocol(cls, path):
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
out = []
if path not in self.dircache:
try:
try:
out = [
(fn, details)
for (fn, details) in self.ftp.mlsd(path)
if fn not in [".", ".."]
and details["type"] not in ["pdir", "cdir"]
]
except error_perm:
out = _mlsd2(self.ftp, path) # Not platform independent
for fn, details in out:
if path == "/":
path = "" # just for forming the names, below
details["name"] = "/".join([path, fn.lstrip("/")])
if details["type"] == "file":
details["size"] = int(details["size"])
else:
details["size"] = 0
if details["type"] == "dir":
details["type"] = "directory"
self.dircache[path] = out
except Error:
try:
info = self.info(path)
if info["type"] == "file":
out = [(path, info)]
except (Error, IndexError):
raise FileNotFoundError(path)
files = self.dircache.get(path, out)
if not detail:
return sorted([fn for fn, details in files])
return [details for fn, details in files]
def info(self, path, **kwargs):
# implement with direct method
path = self._strip_protocol(path)
if path == "/":
# special case, since this dir has no real entry
return {"name": "/", "size": 0, "type": "directory"}
files = self.ls(self._parent(path).lstrip("/"), True)
try:
out = [f for f in files if f["name"] == path][0]
except IndexError:
raise FileNotFoundError(path)
return out
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
if not os.path.exists(lpath):
os.mkdir(lpath)
return
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb")
def cb(x):
outfile.write(x)
self.ftp.retrbinary(
f"RETR {rpath}",
blocksize=self.blocksize,
callback=cb,
)
if not isfilelike(lpath):
outfile.close()
def cat_file(self, path, start=None, end=None, **kwargs):
if end is not None:
return super().cat_file(path, start, end, **kwargs)
out = []
def cb(x):
out.append(x)
try:
self.ftp.retrbinary(
f"RETR {path}",
blocksize=self.blocksize,
rest=start,
callback=cb,
)
except (Error, error_perm) as orig_exc:
raise FileNotFoundError(path) from orig_exc
return b"".join(out)
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
autocommit=True,
**kwargs,
):
path = self._strip_protocol(path)
block_size = block_size or self.blocksize
return FTPFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
cache_options=cache_options,
)
def _rm(self, path):
path = self._strip_protocol(path)
self.ftp.delete(path)
self.invalidate_cache(self._parent(path))
def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
else:
self.rmdir(p)
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
path = self._strip_protocol(path)
parent = self._parent(path)
if parent != self.root_marker and not self.exists(parent) and create_parents:
self.mkdir(parent, create_parents=create_parents)
self.ftp.mkd(path)
self.invalidate_cache(self._parent(path))
def makedirs(self, path: str, exist_ok: bool = False) -> None:
path = self._strip_protocol(path)
if self.exists(path):
# NB: "/" does not "exist" as it has no directory entry
if not exist_ok:
raise FileExistsError(f"{path} exists without `exist_ok`")
# exists_ok=True -> no-op
else:
self.mkdir(path, create_parents=True)
def rmdir(self, path):
path = self._strip_protocol(path)
self.ftp.rmd(path)
self.invalidate_cache(self._parent(path))
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
self.ftp.rename(path1, path2)
self.invalidate_cache(self._parent(path1))
self.invalidate_cache(self._parent(path2))
def __del__(self):
self.ftp.close()
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class TransferDone(Exception):
"""Internal exception to break out of transfer"""
pass
class FTPFile(AbstractBufferedFile):
"""Interact with a remote FTP file with read/write buffering"""
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
if not autocommit:
self.target = self.path
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
def _fetch_range(self, start, end):
"""Get bytes between given byte limits
Implemented by raising an exception in the fetch callback when the
number of bytes received reaches the requested amount.
Will fail if the server does not respect the REST command on
retrieve requests.
"""
out = []
total = [0]
def callback(x):
total[0] += len(x)
if total[0] > end - start:
out.append(x[: (end - start) - total[0]])
if end < self.size:
raise TransferDone
else:
out.append(x)
if total[0] == end - start and end < self.size:
raise TransferDone
try:
self.fs.ftp.retrbinary(
f"RETR {self.path}",
blocksize=self.blocksize,
rest=start,
callback=callback,
)
except TransferDone:
try:
# stop transfer, we got enough bytes for this block
self.fs.ftp.abort()
self.fs.ftp.getmultiline()
except Error:
self.fs._connect()
return b"".join(out)
def _upload_chunk(self, final=False):
self.buffer.seek(0)
self.fs.ftp.storbinary(
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
)
return True
def _mlsd2(ftp, path="."):
"""
Fall back to using `dir` instead of `mlsd` if not supported.
This parses a Linux style `ls -l` response to `dir`, but the response may
be platform dependent.
Parameters
----------
ftp: ftplib.FTP
path: str
Expects to be given path, but defaults to ".".
"""
lines = []
minfo = []
ftp.dir(path, lines.append)
for line in lines:
split_line = line.split()
if len(split_line) < 9:
continue
this = (
split_line[-1],
{
"modify": " ".join(split_line[5:8]),
"unix.owner": split_line[2],
"unix.group": split_line[3],
"unix.mode": split_line[0],
"size": split_line[4],
},
)
if "d" == this[1]["unix.mode"][0]:
this[1]["type"] = "dir"
else:
this[1]["type"] = "file"
minfo.append(this)
return minfo