diff --git a/waifuc/__init__.py b/waifuc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/waifuc/__pycache__/__init__.cpython-310.pyc b/waifuc/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2052eafc697c8e9cf21abf1617f35b0009fa755d Binary files /dev/null and b/waifuc/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/action/__init__.py b/waifuc/action/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1305e1870b7b640634e62c896c08d985e89e33d6 --- /dev/null +++ b/waifuc/action/__init__.py @@ -0,0 +1,13 @@ +from .align import AlignMaxSizeAction, AlignMinSizeAction, PaddingAlignAction +from .augument import RandomFilenameAction, RandomChoiceAction, BaseRandomAction, MirrorAction +from .background import BackgroundRemovalAction +from .base import BaseAction, ProcessAction, FilterAction, ActionStop +from .basic import ModeConvertAction +from .ccip import CCIPAction +from .count import SliceSelectAction, FirstNSelectAction +from .filename import FileExtAction, FileOrderAction +from .filter import NoMonochromeAction, OnlyMonochromeAction, ClassFilterAction, RatingFilterAction, FaceCountAction, \ + HeadCountAction, PersonRatioAction, MinSizeFilterAction, MinAreaFilterAction +from .lpips import FilterSimilarAction +from .split import PersonSplitAction, ThreeStageSplitAction +from .tagging import TaggingAction, TagFilterAction diff --git a/waifuc/action/__pycache__/__init__.cpython-310.pyc b/waifuc/action/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d580cc008cfc7e24e71beffc7f62b6b0c7ba9f48 Binary files /dev/null and b/waifuc/action/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/align.cpython-310.pyc b/waifuc/action/__pycache__/align.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..af6399c3ecc68a9a7f54ac79d9259cfe05479740 Binary files /dev/null and b/waifuc/action/__pycache__/align.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/augument.cpython-310.pyc b/waifuc/action/__pycache__/augument.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f5abe604e428b23135235d05f91c1d166ff4c3a Binary files /dev/null and b/waifuc/action/__pycache__/augument.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/background.cpython-310.pyc b/waifuc/action/__pycache__/background.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f834cce44a5d072bf0b90cb953d44c1d70b6c8ed Binary files /dev/null and b/waifuc/action/__pycache__/background.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/base.cpython-310.pyc b/waifuc/action/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69fa0ddf938ee3965c32a457b027596022e80668 Binary files /dev/null and b/waifuc/action/__pycache__/base.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/basic.cpython-310.pyc b/waifuc/action/__pycache__/basic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb6c9fbe45fde4a19f200c03ca8cb4117fb06dfe Binary files /dev/null and b/waifuc/action/__pycache__/basic.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/ccip.cpython-310.pyc b/waifuc/action/__pycache__/ccip.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1a723411349015e8277b330600bd6060dc6004c Binary files /dev/null and b/waifuc/action/__pycache__/ccip.cpython-310.pyc differ diff --git a/waifuc/action/__pycache__/count.cpython-310.pyc b/waifuc/action/__pycache__/count.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed4e1c39e73aed617dbaa4e88c9504e9ff78a9df Binary files /dev/null and 
class AlignMaxSizeAction(ProcessAction):
    """Shrink images whose longer edge exceeds ``max_size``.

    Aspect ratio is preserved; images already within the limit pass through
    unchanged.
    """

    def __init__(self, max_size: int):
        self._max_size = max_size

    def process(self, item: ImageItem) -> ImageItem:
        image = item.image
        longest_edge = max(image.width, image.height)
        if longest_edge > self._max_size:
            # Same arithmetic as a plain downscale: divide both edges by the
            # overshoot ratio and truncate to integers.
            ratio = longest_edge / self._max_size
            image = image.resize((int(image.width / ratio), int(image.height / ratio)))
        return ImageItem(image, item.meta)


class AlignMinSizeAction(ProcessAction):
    """Shrink images whose shorter edge exceeds ``min_size``.

    Aspect ratio is preserved; images whose shorter edge is already at or
    below the limit pass through unchanged.
    """

    def __init__(self, min_size: int):
        self._min_size = min_size

    def process(self, item: ImageItem) -> ImageItem:
        image = item.image
        shortest_edge = min(image.width, image.height)
        if shortest_edge > self._min_size:
            ratio = shortest_edge / self._min_size
            image = image.resize((int(image.width / ratio), int(image.height / ratio)))
        return ImageItem(image, item.meta)


class PaddingAlignAction(ProcessAction):
    """Fit each image onto a fixed-size canvas, padding with ``color``.

    The image is scaled to fit inside ``size`` without distortion, then
    centered on a canvas of exactly ``size``; the result is converted back
    to the original image's mode.
    """

    def __init__(self, size: Tuple[int, int], color: str = 'white'):
        self.width, self.height = size
        self.color = color

    def process(self, item: ImageItem) -> ImageItem:
        # Work in RGBA so the alpha channel can be used as the paste mask.
        source = load_image(item.image, force_background=None, mode='RGBA')
        scale = min(self.width / source.width, self.height / source.height)
        resized = source.resize((int(source.width * scale), int(source.height * scale)))

        canvas = Image.new('RGBA', (self.width, self.height), self.color)
        left = int((canvas.width - resized.width) // 2)
        top = int((canvas.height - resized.height) // 2)
        canvas.paste(resized, (left, top, left + resized.width, top + resized.height), resized)
        return ImageItem(canvas.convert(item.image.mode), item.meta)
class BaseRandomAction(BaseAction):
    """Base class for actions backed by a seedable random generator.

    :param seed: Optional seed; :meth:`reset` re-creates the generator so a
        re-run reproduces the same random sequence.
    """

    def __init__(self, seed=None):
        self.seed = seed
        self.random = random.Random(self.seed)

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        raise NotImplementedError  # pragma: no cover

    def reset(self):
        # Rebuild the generator from the stored seed for deterministic reruns.
        self.random = random.Random(self.seed)


class RandomChoiceAction(BaseRandomAction):
    """Randomly keep each item with probability ``p``."""

    def __init__(self, p=0.5, seed=None):
        BaseRandomAction.__init__(self, seed)
        self.p = p

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if self.random.random() <= self.p:
            yield item


class RandomFilenameAction(BaseRandomAction):
    """Replace each item's filename with a random SHA1-based name.

    :param ext: Extension for the new filename. When falsy, the extension of
        the item's existing filename is reused; if the item has no filename
        either, a ``NameError`` is raised.
    :param seed: Optional random seed.
    """

    def __init__(self, ext: Optional[str] = '.png', seed=None):
        BaseRandomAction.__init__(self, seed)
        self.ext = ext

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if 'filename' in item.meta:
            # BUGFIX: os.path.splitext returns (root, ext); index [1] is the
            # extension. The original used [0], which fell back to the whole
            # file body (e.g. 'image' instead of '.png') when self.ext is None.
            ext = self.ext or os.path.splitext(os.path.basename(item.meta['filename']))[1]
        else:
            if self.ext:
                ext = self.ext
            else:
                raise NameError(f'Extension (ext) must be specified '
                                f'when filename not in metadata of image item - {item!r}.')

        filename = random_sha1(rnd=self.random) + ext
        yield ImageItem(item.image, {**item.meta, 'filename': filename})


class MirrorAction(BaseAction):
    """Emit each item twice: the original image and its horizontal mirror.

    When the item has a filename, the two outputs get ``_<origin>`` /
    ``_<mirror>`` suffixes (names taken from ``names``); otherwise both reuse
    the original metadata unchanged.
    """

    def __init__(self, names: Tuple[str, str] = ('origin', 'mirror')):
        self.origin_name, self.mirror_name = names

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if 'filename' in item.meta:
            filebody, ext = os.path.splitext(item.meta['filename'])
            yield ImageItem(item.image, {**item.meta, 'filename': f'{filebody}_{self.origin_name}{ext}'})
            yield ImageItem(ImageOps.mirror(item.image),
                            {**item.meta, 'filename': f'{filebody}_{self.mirror_name}{ext}'})
        else:
            yield ImageItem(item.image, item.meta)
            yield ImageItem(ImageOps.mirror(item.image), item.meta)

    def reset(self):
        pass
class ActionStop(Exception):
    """Raised by an action to terminate the remaining pipeline early."""


class BaseAction:
    """Abstract pipeline stage mapping one item to zero or more items."""

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        raise NotImplementedError  # pragma: no cover

    def iter_from(self, iter_: Iterable[ImageItem]) -> Iterator[ImageItem]:
        """Apply :meth:`iter` across a stream; :class:`ActionStop` ends it."""
        for source_item in iter_:
            try:
                yield from self.iter(source_item)
            except ActionStop:
                break

    def reset(self):
        raise NotImplementedError  # pragma: no cover


class ProcessAction(BaseAction):
    """One-to-one action: every input item yields exactly one output item."""

    def process(self, item: ImageItem) -> ImageItem:
        raise NotImplementedError  # pragma: no cover

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        yield self.process(item)

    def reset(self):
        pass

    def __call__(self, item: ImageItem) -> ImageItem:
        # Convenience: a ProcessAction can be used as a plain callable.
        return self.process(item)


class FilterAction(BaseAction):
    """One-or-zero action: an item is kept only when :meth:`check` passes."""

    def check(self, item: ImageItem) -> bool:
        raise NotImplementedError  # pragma: no cover

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if self.check(item):
            yield item

    def reset(self):
        pass

    def __call__(self, item: ImageItem) -> bool:
        # Convenience: a FilterAction can be used as a plain predicate.
        return self.check(item)


class BackgroundRemovalAction(ProcessAction):
    """Remove the background, keeping the segmented subject as RGBA."""

    def process(self, item: ImageItem) -> ImageItem:
        _, segmented = segment_rgba_with_isnetis(item.image)
        return ImageItem(segmented, item.meta)
class ModeConvertAction(ProcessAction):
    """Convert each image to a fixed PIL mode.

    :param mode: Target PIL mode (default ``'RGB'``).
    :param force_background: Background color used to flatten transparency
        during conversion; ``None`` leaves transparency handling to the loader.
    """

    def __init__(self, mode='RGB', force_background: Optional[str] = 'white'):
        self.mode = mode
        self.force_background = force_background

    def process(self, item: ImageItem) -> ImageItem:
        image = load_image(item.image, mode=self.mode, force_background=self.force_background)
        return ImageItem(image, item.meta)


class CCIPStatus(IntEnum):
    """Internal state machine of :class:`CCIPAction`."""
    INIT = 0x1              # collecting initial items, no anchor yet
    APPROACH = 0x2          # clustering failed so far; retried every `step` items
    EVAL = 0x3              # anchor established; items compared against it
    INIT_WITH_SOURCE = 0x4  # an external anchor source was supplied


class CCIPAction(BaseAction):
    """Filter a stream of images down to a single character identity.

    The action buffers items until a dominant cluster of CCIP features can be
    found (or loads a ready-made anchor from ``init_source``), then releases
    only items matching that anchor.

    :param init_source: Optional iterable of anchor :class:`ImageItem` objects.
    :param min_val_count: Items to buffer before the first clustering attempt.
    :param step: Interval (in items) between clustering / dump retries.
    :param ratio_threshold: Minimum share of clustered samples a cluster must
        hold to be chosen as the anchor.
    :param min_clu_dump_ratio: Minimum self-consistency ratio required of the
        chosen cluster.
    :param cmp_threshold: Minimum fraction of anchor features an image must
        match to be accepted.
    :param eps: Optional clustering radius forwarded to OPTICS.
    :param min_samples: Optional minimum sample count forwarded to OPTICS.
    :param model: CCIP model name.
    :param threshold: Feature-difference threshold; defaults to the model's
        recommended value.
    """

    def __init__(self, init_source=None, *, min_val_count: int = 15, step: int = 5,
                 ratio_threshold: float = 0.6, min_clu_dump_ratio: float = 0.3, cmp_threshold: float = 0.5,
                 eps: Optional[float] = None, min_samples: Optional[int] = None,
                 model='ccip-caformer-24-randaug-pruned', threshold: Optional[float] = None):
        self.init_source = init_source

        self.min_val_count = min_val_count
        self.step = step
        self.ratio_threshold = ratio_threshold
        self.min_clu_dump_ratio = min_clu_dump_ratio
        self.cmp_threshold = cmp_threshold
        self.eps, self.min_samples = eps, min_samples
        self.model = model
        self.threshold = threshold or ccip_default_threshold(self.model)

        self.items = []          # buffered items not yet released
        self.item_released = []  # parallel flags: item already yielded
        self.feats = []          # CCIP features of accepted/buffered items
        if self.init_source is not None:
            self.status = CCIPStatus.INIT_WITH_SOURCE
        else:
            self.status = CCIPStatus.INIT

    def _extract_feature(self, item: ImageItem):
        # Reuse a pre-computed feature from metadata when available.
        if 'ccip_feature' in item.meta:
            return item.meta['ccip_feature']
        else:
            return ccip_extract_feature(item.image, model=self.model)

    def _try_cluster(self) -> bool:
        """Attempt to find a dominant, self-consistent cluster; on success,
        drop all buffered items outside it and return True."""
        with disable_output():
            clu_ids = ccip_clustering(self.feats, method='optics', model=self.model,
                                      eps=self.eps, min_samples=self.min_samples)
        clu_counts = {}
        for id_ in clu_ids:
            if id_ != -1:  # -1 marks noise / unclustered samples
                clu_counts[id_] = clu_counts.get(id_, 0) + 1

        clu_total = sum(clu_counts.values()) if clu_counts else 0
        chosen_id = None
        for id_, count in clu_counts.items():
            if count >= clu_total * self.ratio_threshold:
                chosen_id = id_
                break

        if chosen_id is not None:
            feats = [feat for i, feat in enumerate(self.feats) if clu_ids[i] == chosen_id]
            # Fraction of cluster members that match the cluster itself.
            clu_dump_ratio = np.array([
                self._compare_to_exists(feat, base_set=feats)
                for feat in feats
            ]).astype(float).mean()

            if clu_dump_ratio >= self.min_clu_dump_ratio:
                self.items = [item for i, item in enumerate(self.items) if clu_ids[i] == chosen_id]
                self.item_released = [False] * len(self.items)
                self.feats = [feat for i, feat in enumerate(self.feats) if clu_ids[i] == chosen_id]
                return True
            else:
                return False
        else:
            return False

    def _compare_to_exists(self, feat, base_set=None) -> bool:
        # FIX: the return annotation claimed Tuple[bool, List[int]], but a
        # single bool has always been returned.
        diffs = ccip_batch_differences([feat, *(base_set or self.feats)], model=self.model)[0, 1:]
        matches = diffs <= self.threshold
        return matches.astype(float).mean() >= self.cmp_threshold

    def _dump_items(self) -> Iterator[ImageItem]:
        """Yield buffered items (once each) that now match the anchor set."""
        for i in range(len(self.items)):
            if not self.item_released[i]:
                if self._compare_to_exists(self.feats[i]):
                    self.item_released[i] = True
                    yield self.items[i]

    def _eval_iter(self, item: ImageItem) -> Iterator[ImageItem]:
        feat = self._extract_feature(item)
        if self._compare_to_exists(feat):
            self.feats.append(feat)
            yield item

        # Periodically re-check buffered items against the grown feature set.
        if (len(self.feats) - len(self.items)) % self.step == 0:
            yield from self._dump_items()

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if self.status == CCIPStatus.INIT_WITH_SOURCE:
            cnt = 0
            logging.info('Existing anchor detected.')
            for item_ in self.init_source:
                self.feats.append(self._extract_feature(item_))
                yield item_
                cnt += 1
            # FIX: plural_word expects the singular form of the word.
            logging.info(f'{plural_word(cnt, "item")} loaded from anchor.')

            self.status = CCIPStatus.EVAL
            yield from self._eval_iter(item)

        elif self.status == CCIPStatus.INIT:
            self.items.append(item)
            self.feats.append(self._extract_feature(item))

            if len(self.items) >= self.min_val_count:
                if self._try_cluster():
                    self.status = CCIPStatus.EVAL
                    yield from self._dump_items()
                else:
                    self.status = CCIPStatus.APPROACH

        elif self.status == CCIPStatus.APPROACH:
            self.items.append(item)
            self.feats.append(self._extract_feature(item))

            if (len(self.items) - self.min_val_count) % self.step == 0:
                if self._try_cluster():
                    self.status = CCIPStatus.EVAL
                    yield from self._dump_items()

        elif self.status == CCIPStatus.EVAL:
            yield from self._eval_iter(item)

        else:
            raise ValueError(f'Unknown status for {self.__class__.__name__} - {self.status!r}.')

    def reset(self):
        self.items.clear()
        self.item_released.clear()
        self.feats.clear()
        # FIX: match __init__'s `is not None` check so an empty (but present)
        # init_source resets into the same state it was constructed with.
        if self.init_source is not None:
            self.status = CCIPStatus.INIT_WITH_SOURCE
        else:
            self.status = CCIPStatus.INIT
class FirstNSelectAction(BaseAction):
    """Pass through the first ``n`` items, then stop the pipeline."""

    def __init__(self, n: int):
        self._n = n
        self._passed = 0

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if self._passed < self._n:
            yield item
            self._passed += 1
        else:
            raise ActionStop

    def reset(self):
        self._passed = 0


def _slice_process(start, stop, step):
    """Normalize and validate slice-style arguments.

    ``start`` defaults to 0 and ``step`` to 1; ``stop`` may stay ``None``
    (unbounded). Raises :class:`ValueError` on non-integer or out-of-range
    values.
    """
    start = 0 if start is None else start
    step = 1 if step is None else step
    if not isinstance(start, int) or start < 0:
        raise ValueError(f'Start should be an integer no less than 0, but {start!r} found.')
    if stop is not None and (not isinstance(stop, int) or stop < 0):
        raise ValueError(f'Stop should be an integer no less than 0, but {stop!r} found.')
    if not isinstance(step, int) or step < 1:
        raise ValueError(f'Step should be an integer no less than 1, but {step!r} found.')

    return start, stop, step


class SliceSelectAction(BaseAction):
    """Select items by position with ``slice``-like arguments.

    Mirrors the builtin ``slice`` signature: ``()``, ``(stop)``,
    ``(start, stop)`` or ``(start, stop, step)``. Once the last selectable
    position is passed, the pipeline is stopped.
    """

    def __init__(self, *args):
        if len(args) == 0:
            slice_args = _slice_process(None, None, None)
        elif len(args) == 1:
            slice_args = _slice_process(None, args[0], None)
        elif len(args) == 2:
            slice_args = _slice_process(args[0], args[1], None)
        elif len(args) == 3:
            slice_args = _slice_process(args[0], args[1], args[2])
        else:
            # FIX: repaired the garbled error message ("should no no more").
            raise ValueError(f'Arguments of {self.__class__.__name__} should be '
                             f'no more than 3, but {args!r} found.')

        self._start, self._stop, self._step = slice_args
        if self._stop is not None:
            # Last index that can actually be selected; used for early stop.
            self._max = self._start + ((self._stop - self._start - 1) // self._step) * self._step
        else:
            self._max = None
        self._current = 0

    def _check_current(self):
        if self._stop is not None and self._current >= self._stop:
            return False
        if self._current < self._start:
            return False
        return (self._current - self._start) % self._step == 0

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        # BUGFIX: when no stop was given, self._max is None and the original
        # comparison `self._current > self._max` raised TypeError (int > None)
        # on every item. Unbounded slices must never early-stop.
        if self._max is not None and self._current > self._max:
            raise ActionStop
        if self._check_current():
            yield item
        self._current += 1

    def reset(self):
        self._current = 0
class FileExtAction(BaseAction):
    """Force every item's filename to carry the given extension.

    Items without a filename are named ``untitled_<k><ext>`` with a running
    counter.
    """

    def __init__(self, ext: str):
        self.ext = ext
        self.untitles = 0

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        try:
            stem, _ = os.path.splitext(item.meta['filename'])
            new_name = f'{stem}{self.ext}'
        except KeyError:
            self.untitles += 1
            new_name = f'untitled_{self.untitles}{self.ext}'

        yield ImageItem(item.image, {**item.meta, 'filename': new_name})

    def reset(self):
        self.untitles = 0


class FileOrderAction(BaseAction):
    """Rename items sequentially: ``1<ext>``, ``2<ext>``, ...

    When ``ext`` is falsy, the item's own extension is reused; unnamed items
    then raise :class:`ValueError`.
    """

    def __init__(self, ext: Optional[str] = '.png'):
        self.ext = ext
        self._current = 0

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        self._current += 1
        if 'filename' in item.meta:
            _, old_ext = os.path.splitext(item.meta['filename'])
            new_name = f'{self._current}{self.ext or old_ext}'
        elif self.ext:
            new_name = f'{self._current}{self.ext}'
        else:
            raise ValueError('No extension name provided for unnamed file.')

        yield ImageItem(item.image, {**item.meta, 'filename': new_name})

    def reset(self):
        self._current = 0


class NoMonochromeAction(FilterAction):
    """Drop monochrome (greyscale/sketch-like) images."""

    def check(self, item: ImageItem) -> bool:
        return not is_monochrome(item.image)


class OnlyMonochromeAction(FilterAction):
    """Keep only monochrome (greyscale/sketch-like) images."""

    def check(self, item: ImageItem) -> bool:
        return is_monochrome(item.image)
ImageClassTyping = Literal['illustration', 'bangumi', 'comic', '3d']


class ClassFilterAction(FilterAction):
    """Keep images whose predicted class is one of ``classes``.

    An optional ``threshold`` additionally requires the classifier's score to
    reach that value; extra keyword arguments are forwarded to the classifier.
    """

    def __init__(self, classes: List[ImageClassTyping], threshold: Optional[float] = None, **kwargs):
        self.classes = classes
        self.threshold = threshold
        self.kwargs = kwargs

    def check(self, item: ImageItem) -> bool:
        category, confidence = anime_classify(item.image, **self.kwargs)
        if category not in self.classes:
            return False
        return self.threshold is None or confidence >= self.threshold


ImageRatingTyping = Literal['safe', 'r15', 'r18']


class RatingFilterAction(FilterAction):
    """Keep images whose predicted content rating is one of ``ratings``."""

    def __init__(self, ratings: List[ImageRatingTyping], threshold: Optional[float] = None, **kwargs):
        self.ratings = ratings
        self.threshold = threshold
        self.kwargs = kwargs

    def check(self, item: ImageItem) -> bool:
        rating, confidence = anime_rating(item.image, **self.kwargs)
        if rating not in self.ratings:
            return False
        return self.threshold is None or confidence >= self.threshold


class FaceCountAction(FilterAction):
    """Keep images containing exactly ``count`` detected faces."""

    def __init__(self, count: int, level: str = 's', version: str = 'v1.4',
                 conf_threshold: float = 0.25, iou_threshold: float = 0.7):
        self.count = count
        self.level = level
        self.version = version
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold

    def check(self, item: ImageItem) -> bool:
        found = detect_faces(item.image, self.level, self.version,
                             conf_threshold=self.conf_threshold, iou_threshold=self.iou_threshold)
        return len(found) == self.count


class HeadCountAction(FilterAction):
    """Keep images containing exactly ``count`` detected heads."""

    def __init__(self, count: int, level: str = 's', conf_threshold: float = 0.3, iou_threshold: float = 0.7):
        self.count = count
        self.level = level
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold

    def check(self, item: ImageItem) -> bool:
        found = detect_heads(
            item.image, self.level,
            conf_threshold=self.conf_threshold,
            iou_threshold=self.iou_threshold
        )
        return len(found) == self.count


class PersonRatioAction(FilterAction):
    """Keep images with exactly one person covering at least ``ratio`` of
    the image area."""

    def __init__(self, ratio: float = 0.4, level: str = 'm', version: str = 'v1.1',
                 conf_threshold: float = 0.3, iou_threshold: float = 0.5):
        self.ratio = ratio
        self.level = level
        self.version = version
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold

    def check(self, item: ImageItem) -> bool:
        detections = detect_person(item.image, self.level, self.version, 640, self.conf_threshold, self.iou_threshold)
        if len(detections) != 1:
            return False

        (x0, y0, x1, y1), _, _ = detections[0]
        box_area = abs((x1 - x0) * (y1 - y0))
        return box_area >= self.ratio * (item.image.width * item.image.height)


class MinSizeFilterAction(FilterAction):
    """Keep images whose shorter edge is at least ``min_size`` pixels."""

    def __init__(self, min_size: int):
        self.min_size = min_size

    def check(self, item: ImageItem) -> bool:
        shorter_edge = min(item.image.width, item.image.height)
        return shorter_edge >= self.min_size


class MinAreaFilterAction(FilterAction):
    """Keep images whose area corresponds to a square of edge at least
    ``min_size`` (i.e. sqrt(width * height) >= min_size)."""

    def __init__(self, min_size: int):
        self.min_size = min_size

    def check(self, item: ImageItem) -> bool:
        edge_equivalent = (item.image.width * item.image.height) ** 0.5
        return edge_equivalent >= self.min_size
class FeatureBucket:
    """Bounded store of LPIPS features for near-duplicate detection.

    Features are stored alongside their aspect ratios; only stored entries
    with a close aspect ratio are compared, which keeps lookups cheap.
    """

    def __init__(self, threshold: float = 0.45, capacity: int = 500, rtol=1.e-5, atol=1.e-8):
        self.threshold = threshold
        self.rtol, self.atol = rtol, atol
        self.features = []
        self.ratios = np.array([], dtype=float)
        self.capacity = capacity

    def check_duplicate(self, feat, ratio: float):
        """Return True when a stored feature with a similar aspect ratio lies
        within the LPIPS difference threshold."""
        candidate_ids = np.where(np.isclose(self.ratios, ratio, rtol=self.rtol, atol=self.atol))[0]
        return any(
            lpips_difference(self.features[idx.item()], feat) <= self.threshold
            for idx in candidate_ids
        )

    def add(self, feat, ratio: float):
        """Store a feature; once the store reaches twice ``capacity``, keep
        only the newest ``capacity`` entries."""
        self.features.append(feat)
        self.ratios = np.append(self.ratios, ratio)
        if len(self.features) >= self.capacity * 2:
            self.features = self.features[-self.capacity:]
            self.ratios = self.ratios[-self.capacity:]


FilterSimilarModeTyping = Literal['all', 'group']


class FilterSimilarAction(BaseAction):
    """Drop images that are LPIPS-similar to previously seen ones.

    ``mode='all'`` compares against everything seen so far; ``mode='group'``
    keeps an independent bucket per ``group_id`` metadata value.
    """

    def __init__(self, mode: FilterSimilarModeTyping = 'all', threshold: float = 0.45,
                 capacity: int = 500, rtol=5.e-2, atol=2.e-2):
        self.mode = mode
        self.threshold, self.rtol, self.atol = threshold, rtol, atol
        self.capacity = capacity
        self.buckets: Dict[str, FeatureBucket] = {}
        self.global_bucket = FeatureBucket(threshold, self.capacity, rtol, atol)

    def _get_bin(self, group_id):
        if self.mode == 'all':
            return self.global_bucket
        if self.mode == 'group':
            # Lazily create one bucket per group.
            if group_id not in self.buckets:
                self.buckets[group_id] = FeatureBucket(self.threshold, self.capacity, self.rtol, self.atol)
            return self.buckets[group_id]
        raise ValueError(f'Unknown mode for filter similar action - {self.mode!r}.')

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        image = item.image
        aspect = image.height * 1.0 / image.width
        feat = lpips_extract_feature(image)
        bucket = self._get_bin(item.meta.get('group_id'))

        if bucket.check_duplicate(feat, aspect):
            return
        bucket.add(feat, aspect)
        yield item

    def reset(self):
        self.buckets.clear()
        self.global_bucket = FeatureBucket(self.threshold, self.capacity, self.rtol, self.atol)
class PersonSplitAction(BaseAction):
    """Split an image into one item per detected person.

    Each yielded crop gets a ``meta['crop']`` record describing the detection;
    existing ``tags`` are dropped unless ``keep_origin_tags`` is set, since
    whole-image tags may no longer describe the crop.
    """

    def __init__(self, keep_original: bool = False, level: str = 'm', version: str = 'v1.1',
                 conf_threshold: float = 0.3, iou_threshold: float = 0.5, keep_origin_tags: bool = False):
        self.keep_original = keep_original
        self.level = level
        self.version = version
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.keep_origin_tags = keep_origin_tags

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        detection = detect_person(item.image, self.level, self.version,
                                  conf_threshold=self.conf_threshold, iou_threshold=self.iou_threshold)

        # Derive the filename stem once; crops get a ``_person<i>`` suffix.
        if 'filename' in item.meta:
            filename = item.meta['filename']
            filebody, ext = os.path.splitext(filename)
        else:
            filebody, ext = None, None

        if self.keep_original:
            yield item

        # NOTE: person indices start at 0 here (unlike ThreeStageSplitAction,
        # whose person indices start at 1).
        for i, (area, type_, score) in enumerate(detection):
            new_meta = {
                **item.meta,
                'crop': {'type': type_, 'score': score},
            }
            if 'tags' in new_meta and not self.keep_origin_tags:
                del new_meta['tags']
            if filebody is not None:
                new_meta['filename'] = f'{filebody}_person{i}{ext}'
            yield ImageItem(item.image.crop(area), new_meta)

    def reset(self):
        pass


class ThreeStageSplitAction(BaseAction):
    """Cascade crop: person -> half body -> head (optionally -> eyes).

    For each detected person, the person crop, a half-body crop and a head
    crop (expanded by ``head_scale``) are yielded; with ``split_eyes`` an
    additional eye crop (expanded by ``eye_scale``) per detected eye is
    produced from the head crop. ``split_person=False`` treats the whole
    image as a single person.
    """

    def __init__(self, person_conf: Optional[dict] = None, halfbody_conf: Optional[dict] = None,
                 head_conf: Optional[dict] = None, head_scale: float = 1.5,
                 split_eyes: bool = False, eye_conf: Optional[dict] = None, eye_scale: float = 2.4,
                 split_person: bool = True, keep_origin_tags: bool = False):
        # Per-stage keyword arguments forwarded to the respective detectors.
        self.person_conf = dict(person_conf or {})
        self.halfbody_conf = dict(halfbody_conf or {})
        self.head_conf = dict(head_conf or {})
        self.eye_conf = dict(eye_conf or {})
        self.head_scale = head_scale
        self.eye_scale = eye_scale
        self.split_eyes = split_eyes
        self.split_person = split_person
        self.keep_origin_tags = keep_origin_tags

    def _split_person(self, item: ImageItem, filebody, ext):
        # Yields (person_index, person_item) pairs; indices start at 1.
        if self.split_person:
            for i, (px, type_, score) in enumerate(detect_person(item.image, **self.person_conf), start=1):
                person_image = item.image.crop(px)
                person_meta = {
                    **item.meta,
                    'crop': {'type': type_, 'score': score},
                }
                if 'tags' in person_meta and not self.keep_origin_tags:
                    del person_meta['tags']
                if filebody is not None:
                    person_meta['filename'] = f'{filebody}_person{i}{ext}'
                yield i, ImageItem(person_image, person_meta)

        else:
            # Whole image treated as the single person #1.
            yield 1, item

    def iter(self, item: ImageItem) -> Iterator[ImageItem]:
        if 'filename' in item.meta:
            filename = item.meta['filename']
            filebody, ext = os.path.splitext(filename)
        else:
            filebody, ext = None, None

        for i, person_item in self._split_person(item, filebody, ext):
            person_image = person_item.image
            yield person_item

            # Stage 2: best half-body detection (first result only).
            half_detects = detect_halfbody(person_image, **self.halfbody_conf)
            if half_detects:
                halfbody_area, halfbody_type, halfbody_score = half_detects[0]
                halfbody_image = person_image.crop(halfbody_area)
                halfbody_meta = {
                    **item.meta,
                    'crop': {'type': halfbody_type, 'score': halfbody_score},
                }
                if 'tags' in halfbody_meta and not self.keep_origin_tags:
                    del halfbody_meta['tags']
                if filebody is not None:
                    halfbody_meta['filename'] = f'{filebody}_person{i}_halfbody{ext}'
                yield ImageItem(halfbody_image, halfbody_meta)

            # Stage 3: best head detection, expanded to a square scaled by
            # ``head_scale`` and clamped to the person crop's bounds.
            head_detects = detect_heads(person_image, **self.head_conf)
            if head_detects:
                (hx0, hy0, hx1, hy1), head_type, head_score = head_detects[0]
                cx, cy = (hx0 + hx1) / 2, (hy0 + hy1) / 2
                width, height = hx1 - hx0, hy1 - hy0
                width = height = max(width, height) * self.head_scale
                x0, y0 = int(max(cx - width / 2, 0)), int(max(cy - height / 2, 0))
                x1, y1 = int(min(cx + width / 2, person_image.width)), int(min(cy + height / 2, person_image.height))
                head_image = person_image.crop((x0, y0, x1, y1))
                head_meta = {
                    **item.meta,
                    'crop': {'type': head_type, 'score': head_score},
                }
                if 'tags' in head_meta and not self.keep_origin_tags:
                    del head_meta['tags']
                if filebody is not None:
                    head_meta['filename'] = f'{filebody}_person{i}_head{ext}'
                yield ImageItem(head_image, head_meta)

                # Optional stage 4: square eye crops from the head crop,
                # expanded by ``eye_scale``. Eye indices start at 0.
                if self.split_eyes:
                    eye_detects = detect_eyes(head_image, **self.eye_conf)
                    for j, ((ex0, ey0, ex1, ey1), eye_type, eye_score) in enumerate(eye_detects):
                        cx, cy = (ex0 + ex1) / 2, (ey0 + ey1) / 2
                        width, height = ex1 - ex0, ey1 - ey0
                        width = height = max(width, height) * self.eye_scale
                        x0, y0 = int(max(cx - width / 2, 0)), int(max(cy - height / 2, 0))
                        x1, y1 = int(min(cx + width / 2, head_image.width)), \
                            int(min(cy + height / 2, head_image.height))
                        eye_image = head_image.crop((x0, y0, x1, y1))
                        eye_meta = {
                            **item.meta,
                            'crop': {'type': eye_type, 'score': eye_score},
                        }
                        if 'tags' in eye_meta and not self.keep_origin_tags:
                            del eye_meta['tags']
                        if filebody is not None:
                            eye_meta['filename'] = f'{filebody}_person{i}_head_eye{j}{ext}'
                        yield ImageItem(eye_image, eye_meta)

    def reset(self):
        pass
general_threshold, character_threshold) + return {**features, **characters} + + +def _mldanbooru_tagging(image: Image.Image, use_real_name: bool = False, general_threshold: float = 0.7, **kwargs): + _ = kwargs + features = get_mldanbooru_tags(image, use_real_name, general_threshold) + return features + + +_TAGGING_METHODS = { + 'deepdanbooru': _deepdanbooru_tagging, + 'wd14_vit': partial(_wd14_tagging, model_name='ViT'), + 'wd14_convnext': partial(_wd14_tagging, model_name='ConvNext'), + 'wd14_convnextv2': partial(_wd14_tagging, model_name='ConvNextV2'), + 'wd14_swinv2': partial(_wd14_tagging, model_name='SwinV2'), + 'mldanbooru': _mldanbooru_tagging, +} + +TaggingMethodTyping = Literal[ + 'deepdanbooru', 'wd14_vit', 'wd14_convnext', 'wd14_convnextv2', 'wd14_swinv2', 'mldanbooru'] + + +class TaggingAction(ProcessAction): + def __init__(self, method: TaggingMethodTyping = 'wd14_convnextv2', force: bool = False, **kwargs): + self.method = _TAGGING_METHODS[method] + self.force = force + self.kwargs = kwargs + + def process(self, item: ImageItem) -> ImageItem: + if 'tags' in item.meta and not self.force: + return item + else: + tags = self.method(image=item.image, **self.kwargs) + return ImageItem(item.image, {**item.meta, 'tags': tags}) + + +class TagFilterAction(BaseAction): + def __init__(self, tags: Union[List[str], Mapping[str, float]], + method: TaggingMethodTyping = 'wd14_convnextv2', **kwargs): + if isinstance(tags, (list, tuple)): + self.tags = {tag: 1e-6 for tag in tags} + elif isinstance(tags, dict): + self.tags = dict(tags) + else: + raise TypeError(f'Unknown type of tags - {tags!r}.') + self.tagger = TaggingAction(method, force=False, **kwargs) + + def iter(self, item: ImageItem) -> Iterator[ImageItem]: + item = self.tagger(item) + tags = item.meta['tags'] + + valid = True + for tag, min_score in self.tags.items(): + if tags[tag] < min_score: + valid = False + break + + if valid: + yield item + + def reset(self): + self.tagger.reset() diff --git 
a/waifuc/config/__init__.py b/waifuc/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/waifuc/config/__pycache__/__init__.cpython-310.pyc b/waifuc/config/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64e9186a709311f1e7d5400fa72996f9f0072114 Binary files /dev/null and b/waifuc/config/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/config/__pycache__/meta.cpython-310.pyc b/waifuc/config/__pycache__/meta.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d98d2f6096dbfecf53ae9725f908a08d810f4ff7 Binary files /dev/null and b/waifuc/config/__pycache__/meta.cpython-310.pyc differ diff --git a/waifuc/config/meta.py b/waifuc/config/meta.py new file mode 100644 index 0000000000000000000000000000000000000000..6155456c454bbc47c770bc5c05cdaa49fdc56938 --- /dev/null +++ b/waifuc/config/meta.py @@ -0,0 +1,19 @@ +""" +Overview: + Meta information for waifuc package. +""" + +#: Title of this project (should be `waifuc`). +__TITLE__ = 'waifuc' + +#: Version of this project. +__VERSION__ = '0.0.1' + +#: Short description of the project, will be included in ``setup.py``. +__DESCRIPTION__ = 'Efficient Train Data Collector for Anime Waifu' + +#: Author of this project. +__AUTHOR__ = 'narugo1992' + +#: Email of the authors'. 
+__AUTHOR_EMAIL__ = 'narugo992@gmail.com' diff --git a/waifuc/export/__init__.py b/waifuc/export/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0a4f9cae0b1df0077bf1bdae6a1a832600637f8b --- /dev/null +++ b/waifuc/export/__init__.py @@ -0,0 +1,3 @@ +from .base import BaseExporter, SaveExporter, LocalDirectoryExporter +from .huggingface import HuggingFaceExporter +from .textual_inversion import TextualInversionExporter diff --git a/waifuc/export/__pycache__/__init__.cpython-310.pyc b/waifuc/export/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4d1c97eabb0806ccd59b26bb1fba8df62beabff Binary files /dev/null and b/waifuc/export/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/export/__pycache__/base.cpython-310.pyc b/waifuc/export/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f239c5c38aa59caf30a131f0481f85199d95319e Binary files /dev/null and b/waifuc/export/__pycache__/base.cpython-310.pyc differ diff --git a/waifuc/export/__pycache__/huggingface.cpython-310.pyc b/waifuc/export/__pycache__/huggingface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e9b4f14d242de1fe9f20a689d6b574bd207987c Binary files /dev/null and b/waifuc/export/__pycache__/huggingface.cpython-310.pyc differ diff --git a/waifuc/export/__pycache__/textual_inversion.cpython-310.pyc b/waifuc/export/__pycache__/textual_inversion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f58b6500767ecbcedc8411e8ac5a7d5d0e148aba Binary files /dev/null and b/waifuc/export/__pycache__/textual_inversion.cpython-310.pyc differ diff --git a/waifuc/export/base.py b/waifuc/export/base.py new file mode 100644 index 0000000000000000000000000000000000000000..d95778e42ba99659a56ca669dced3abdafacd05b --- /dev/null +++ b/waifuc/export/base.py @@ -0,0 +1,79 @@ +import os.path +from typing import 
Iterator + +from hbutils.system import remove +from tqdm.auto import tqdm + +from ..model import ImageItem +from ..utils import get_task_names + + +class BaseExporter: + def pre_export(self): + raise NotImplementedError # pragma: no cover + + def export_item(self, item: ImageItem): + raise NotImplementedError # pragma: no cover + + def post_export(self): + raise NotImplementedError # pragma: no cover + + def export_from(self, items: Iterator[ImageItem]): + self.pre_export() + names = get_task_names() + if names: + desc = f'{self.__class__.__name__} - {".".join(names)}' + else: + desc = f'{self.__class__.__name__}' + for item in tqdm(items, desc=desc): + self.export_item(item) + self.post_export() + + def reset(self): + raise NotImplementedError # pragma: no cover + + +class LocalDirectoryExporter(BaseExporter): + def __init__(self, output_dir, clear: bool = False): + self.output_dir = output_dir + self.clear = clear + + def pre_export(self): + if self.clear and os.path.exists(self.output_dir): + remove(self.output_dir) + + os.makedirs(self.output_dir, exist_ok=True) + + def export_item(self, item: ImageItem): + raise NotImplementedError # pragma: no cover + + def post_export(self): + pass + + def reset(self): + raise NotImplementedError # pragma: no cover + + +class SaveExporter(LocalDirectoryExporter): + def __init__(self, output_dir, clear: bool = False, no_meta: bool = False, + skip_when_image_exist: bool = False): + LocalDirectoryExporter.__init__(self, output_dir, clear) + self.no_meta = no_meta + self.untitles = 0 + self.skip_when_image_exist = skip_when_image_exist + + def export_item(self, item: ImageItem): + if 'filename' in item.meta: + filename = item.meta['filename'] + else: + self.untitles += 1 + filename = f'untited_{self.untitles}.png' + + full_filename = os.path.join(self.output_dir, filename) + full_directory = os.path.dirname(full_filename) + if full_directory: + os.makedirs(full_directory, exist_ok=True) + item.save(full_filename, 
no_meta=self.no_meta, skip_when_image_exist=self.skip_when_image_exist) + + def reset(self): + self.untitles = 0 diff --git a/waifuc/export/huggingface.py b/waifuc/export/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..2d1232a1bba2340a893a517468e2357e45b70174 --- /dev/null +++ b/waifuc/export/huggingface.py @@ -0,0 +1,64 @@ +import os +import zipfile +from typing import Type, Optional, Mapping, Any + +from hbutils.system import TemporaryDirectory +from huggingface_hub import HfApi + +from .base import LocalDirectoryExporter, BaseExporter +from ..model import ImageItem + + +class HuggingFaceExporter(BaseExporter): + def __init__(self, repository: str, file_in_repo: str, + cls: Type[LocalDirectoryExporter], args: tuple = (), kwargs: Optional[Mapping[str, Any]] = None, + repo_type: str = 'dataset', revision: str = 'main', hf_token: Optional[str] = None): + self.repository = repository + self.repo_type, self.revision = repo_type, revision + self.file_in_repo = file_in_repo + self.cls, self.args, self.kwargs = (cls, args, kwargs or {}) + self._tempdir: Optional[TemporaryDirectory] = None + self._exporter: Optional[LocalDirectoryExporter] = None + self.hf_token = hf_token or os.environ.get('HF_TOKEN') + + def pre_export(self): + self._tempdir = TemporaryDirectory() + self._exporter = self.cls(self._tempdir.name, *self.args, **self.kwargs) + self._exporter.pre_export() + + def export_item(self, item: ImageItem): + self._exporter.export_item(item) + + def post_export(self): + self._exporter.post_export() + + # upload to huggingface + hf_api = HfApi(token=self.hf_token) + hf_api.create_repo(self.repository, repo_type=self.repo_type, exist_ok=True) + with TemporaryDirectory() as td: + zip_file = os.path.join(td, 'package.zip') + with zipfile.ZipFile(zip_file, mode='w') as zf: + for directory, _, files in os.walk(self._tempdir.name): + for file in files: + file_path = os.path.join(directory, file) + rel_file_path = 
os.path.relpath(file_path, self._tempdir.name) + zf.write( + file_path, + '/'.join(rel_file_path.split(os.sep)) + ) + + hf_api.upload_file( + path_or_fileobj=zip_file, + repo_id=self.repository, + repo_type=self.repo_type, + path_in_repo=self.file_in_repo, + revision=self.revision, + commit_message=f'Upload {self.file_in_repo} with waifuc' + ) + + self._exporter = None + self._tempdir.cleanup() + self._tempdir = None + + def reset(self): + pass diff --git a/waifuc/export/textual_inversion.py b/waifuc/export/textual_inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..22f0098c1494b58994193871dfb5f635976a1aa7 --- /dev/null +++ b/waifuc/export/textual_inversion.py @@ -0,0 +1,43 @@ +import os + +from imgutils.tagging import tags_to_text + +from .base import LocalDirectoryExporter +from ..model import ImageItem + + +class TextualInversionExporter(LocalDirectoryExporter): + def __init__(self, output_dir: str, clear: bool = False, + use_spaces: bool = False, use_escape: bool = True, + include_score: bool = False, score_descend: bool = True, + skip_when_image_exist: bool = False): + LocalDirectoryExporter.__init__(self, output_dir, clear) + self.use_spaces = use_spaces + self.use_escape = use_escape + self.include_score = include_score + self.score_descend = score_descend + self.untitles = 0 + self.skip_when_image_exist = skip_when_image_exist + + def export_item(self, item: ImageItem): + if 'filename' in item.meta: + filename = item.meta['filename'] + else: + self.untitles += 1 + filename = f'untited_{self.untitles}.png' + + tags = item.meta.get('tags', None) or {} + + full_filename = os.path.join(self.output_dir, filename) + full_tagname = os.path.join(self.output_dir, os.path.splitext(filename)[0] + '.txt') + full_directory = os.path.dirname(full_filename) + if full_directory: + os.makedirs(full_directory, exist_ok=True) + + if not self.skip_when_image_exist or not os.path.exists(full_filename): + item.image.save(full_filename) + with 
open(full_tagname, 'w', encoding='utf-8') as f: + f.write(tags_to_text(tags, self.use_spaces, self.use_escape, self.include_score, self.score_descend)) + + def reset(self): + self.untitles = 0 diff --git a/waifuc/model/__init__.py b/waifuc/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea87ea1e21fe4ab2c26bb7d1e9496fe96c427420 --- /dev/null +++ b/waifuc/model/__init__.py @@ -0,0 +1 @@ +from .item import load_meta, dump_meta, ImageItem diff --git a/waifuc/model/__pycache__/__init__.cpython-310.pyc b/waifuc/model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ead3e4a61e1291d5810733ee82cc5bf3ac8bf077 Binary files /dev/null and b/waifuc/model/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/model/__pycache__/item.cpython-310.pyc b/waifuc/model/__pycache__/item.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8fba78024460d71031ee03593ed6336815e5c90 Binary files /dev/null and b/waifuc/model/__pycache__/item.cpython-310.pyc differ diff --git a/waifuc/model/item.py b/waifuc/model/item.py new file mode 100644 index 0000000000000000000000000000000000000000..4f585706d26b596677aa353fd82b3385fd0f82df --- /dev/null +++ b/waifuc/model/item.py @@ -0,0 +1,98 @@ +import json +import os.path +import pickle +from dataclasses import dataclass +from typing import Optional + +from PIL import Image +from hbutils.encoding import base64_decode, base64_encode +from hbutils.reflection import quick_import_object + +NoneType = type(None) + +_TYPE_META = '__type' +_BASE64_META = 'base64' + + +def load_meta(data, path=()): + if isinstance(data, (int, float, str, NoneType)): + return data + elif isinstance(data, list): + return [load_meta(item, (*path, i)) for i, item in enumerate(data)] + elif isinstance(data, dict): + if _TYPE_META not in data: + return {key: load_meta(value, (*path, key)) for key, value in data.items()} + else: + cls, _, _ = 
quick_import_object(data[_TYPE_META]) + binary = base64_decode(data[_BASE64_META]) + obj = pickle.loads(binary) + if isinstance(obj, cls): + return obj + else: + raise TypeError(f'{cls!r} expected but {obj!r} found at {path!r}.') + else: + raise TypeError(f'Unknown type {data!r} at {path!r}.') + + +def dump_meta(data, path=()): + if isinstance(data, (int, float, str, NoneType)): + return data + elif isinstance(data, list): + return [dump_meta(item, (*path, i)) for i, item in enumerate(data)] + elif isinstance(data, dict): + return {key: dump_meta(value, (*path, key)) for key, value in data.items()} + else: + cls = type(data) + type_str = f'{cls.__module__}.{cls.__name__}' if hasattr(cls, '__module__') else cls.__name__ + base64_str = base64_encode(pickle.dumps(data)) + return { + _TYPE_META: type_str, + _BASE64_META: base64_str + } + + +@dataclass +class ImageItem: + image: Image.Image + meta: dict + + def __init__(self, image: Image.Image, meta: Optional[dict] = None): + self.image = image + self.meta = meta or {} + + @classmethod + def _image_file_to_meta_file(cls, image_file): + directory, filename = os.path.split(image_file) + filebody, _ = os.path.splitext(filename) + meta_file = os.path.join(directory, f'.{filebody}_meta.json') + return meta_file + + @classmethod + def load_from_image(cls, image_file): + image = Image.open(image_file) + meta_file = cls._image_file_to_meta_file(image_file) + + if os.path.exists(meta_file): + with open(meta_file, 'r', encoding='utf-8') as f: + meta = load_meta(json.load(f)) + else: + meta = {} + + return cls(image, meta) + + def save(self, image_file, no_meta: bool = False, skip_when_image_exist: bool = False): + if not skip_when_image_exist or not os.path.exists(image_file): + self.image.save(image_file) + if not no_meta and self.meta: + meta_file = self._image_file_to_meta_file(image_file) + with open(meta_file, 'w', encoding='utf-8') as f: + json.dump(dump_meta(self.meta), f) + + def __repr__(self): + values = {'size': 
self.image.size} + for key, value in self.meta.items(): + if isinstance(value, (int, float, str)): + values[key] = value + + content = ', '.join(f'{key}: {values[key]!r}' for key in sorted(values.keys())) + return f'<{self.__class__.__name__} {content}>' diff --git a/waifuc/source/__init__.py b/waifuc/source/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28d3c82ca71185f84769654072a7e95e8b0c1b82 --- /dev/null +++ b/waifuc/source/__init__.py @@ -0,0 +1,19 @@ +from .anime_pictures import AnimePicturesSource +from .base import BaseDataSource, EmptySource +from .compose import ParallelDataSource, ComposedDataSource +from .danbooru import DanbooruSource, SafebooruSource, ATFBooruSource, E621LikeSource, E621Source, E926Source +from .derpibooru import DerpibooruLikeSource, DerpibooruSource, FurbooruSource +from .duitang import DuitangSource +from .gchar import GcharAutoSource +from .huashi6 import Huashi6Source +from .konachan import KonachanLikeSource, YandeSource, KonachanSource, KonachanNetSource, LolibooruSource, \ + Rule34LikeSource, Rule34Source, HypnoHubSource, GelbooruSource, XbooruLikeSource, XbooruSource, \ + SafebooruOrgSource, TBIBSource +from .local import LocalSource, LocalTISource +from .paheal import PahealSource +from .pixiv import BasePixivSource, PixivSearchSource, PixivUserSource, PixivRankingSource +from .sankaku import SankakuSource, PostOrder, Rating, FileType +from .video import VideoSource +from .wallhaven import WallHavenSource +from .web import WebDataSource +from .zerochan import ZerochanSource diff --git a/waifuc/source/__pycache__/__init__.cpython-310.pyc b/waifuc/source/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22eb1ff549e426949d796d07fc055785c1c9c04f Binary files /dev/null and b/waifuc/source/__pycache__/__init__.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/anime_pictures.cpython-310.pyc 
b/waifuc/source/__pycache__/anime_pictures.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08ad1347805b4741dc349e4bb1e2f839564ab1b5 Binary files /dev/null and b/waifuc/source/__pycache__/anime_pictures.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/base.cpython-310.pyc b/waifuc/source/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbf620c7effaed99d6e8b11404778c9eb778c919 Binary files /dev/null and b/waifuc/source/__pycache__/base.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/compose.cpython-310.pyc b/waifuc/source/__pycache__/compose.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c963bcb81cf5b7cb36cbbf19c06a80a9689da3a Binary files /dev/null and b/waifuc/source/__pycache__/compose.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/danbooru.cpython-310.pyc b/waifuc/source/__pycache__/danbooru.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab195856340d5ebbf1f25f2326ee5264188b7178 Binary files /dev/null and b/waifuc/source/__pycache__/danbooru.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/derpibooru.cpython-310.pyc b/waifuc/source/__pycache__/derpibooru.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b8afd327a8c5648fa8d2452a83fb854a45cf7bc Binary files /dev/null and b/waifuc/source/__pycache__/derpibooru.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/duitang.cpython-310.pyc b/waifuc/source/__pycache__/duitang.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fde9e5004228c061c9dddd75259fd8cd5783cf6 Binary files /dev/null and b/waifuc/source/__pycache__/duitang.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/gchar.cpython-310.pyc b/waifuc/source/__pycache__/gchar.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..cd002c12d5d5b7b042b6743a759b83987a072233 Binary files /dev/null and b/waifuc/source/__pycache__/gchar.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/huashi6.cpython-310.pyc b/waifuc/source/__pycache__/huashi6.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48c1416556e413c0760ffe7017e922f4fd8962b2 Binary files /dev/null and b/waifuc/source/__pycache__/huashi6.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/konachan.cpython-310.pyc b/waifuc/source/__pycache__/konachan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39f03b7b1f5120b3f72f58aac4dfbe46695d9c72 Binary files /dev/null and b/waifuc/source/__pycache__/konachan.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/local.cpython-310.pyc b/waifuc/source/__pycache__/local.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e906c84bc7779ccc14b14a4a16a45c5a193d81f9 Binary files /dev/null and b/waifuc/source/__pycache__/local.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/paheal.cpython-310.pyc b/waifuc/source/__pycache__/paheal.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a51b24210ac8d35caa117f69e9146d4095d99c45 Binary files /dev/null and b/waifuc/source/__pycache__/paheal.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/pixiv.cpython-310.pyc b/waifuc/source/__pycache__/pixiv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fde867817e54f196b8f29212aa5c684ec3497482 Binary files /dev/null and b/waifuc/source/__pycache__/pixiv.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/sankaku.cpython-310.pyc b/waifuc/source/__pycache__/sankaku.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67e7f911cbbe1bfb9d5afdc042a37687d51dc1bc Binary files /dev/null and 
b/waifuc/source/__pycache__/sankaku.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/video.cpython-310.pyc b/waifuc/source/__pycache__/video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c652536e5eda0df63bdf40429021410c5bda8caa Binary files /dev/null and b/waifuc/source/__pycache__/video.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/wallhaven.cpython-310.pyc b/waifuc/source/__pycache__/wallhaven.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ca09caf53c845083405b180c3d9d3c5bb5d6191 Binary files /dev/null and b/waifuc/source/__pycache__/wallhaven.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/web.cpython-310.pyc b/waifuc/source/__pycache__/web.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..213ad4270e396d83f46212743ae5c555b0fd62da Binary files /dev/null and b/waifuc/source/__pycache__/web.cpython-310.pyc differ diff --git a/waifuc/source/__pycache__/zerochan.cpython-310.pyc b/waifuc/source/__pycache__/zerochan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..237e02a4890ec7cd808f20da879517e3aab702fe Binary files /dev/null and b/waifuc/source/__pycache__/zerochan.cpython-310.pyc differ diff --git a/waifuc/source/anime_pictures.py b/waifuc/source/anime_pictures.py new file mode 100644 index 0000000000000000000000000000000000000000..be308f285a7d92ce5f529fd3e410a7fb6d04ca0c --- /dev/null +++ b/waifuc/source/anime_pictures.py @@ -0,0 +1,111 @@ +import os +from enum import Enum +from typing import Iterator, Tuple, Union, List, Literal + +import cloudscraper +from hbutils.system import urlsplit +from pyquery import PyQuery as pq + +from .web import WebDataSource +from ..utils import get_requests_session, srequest + + +class OrderBy(str, Enum): + STAR_DATE = "stars_date" + DATE = "date" + DATE_REVERS = "date_r" + RATING = "rating" + DOWNLOADS = "views" + SIZE = "size" + TAG_COUNT = 
"tag_num" + + +class Period(str, Enum): + ANYTIME = "0" + PAST_DAY = "3" + PAST_WEEK = "1" + PAST_MONTH = "2" + PAST_6_MONTHS = "4" + PAST_YEAR = "5" + PAST_2_YEARS = "6" + PAST_3_YEARS = "7" + + +class AnimePicturesSource(WebDataSource): + __root__ = 'https://anime-pictures.net' + + def __init__(self, tags: List[str], tag_mode: Literal['or', 'and'] = 'and', + denied_tags: List[str] = None, denied_tag_mode: Literal['or', 'and'] = 'or', + order_by: OrderBy = OrderBy.RATING, period: Period = Period.ANYTIME, + select: Literal['thumbnail', 'preview', 'original'] = 'original', + group_name: str = 'anime_pictures', download_silent: bool = True, **kwargs): + WebDataSource.__init__( + self, group_name, + get_requests_session(session=cloudscraper.create_scraper()), + download_silent, + ) + self.tags, self.tag_mode = tags, tag_mode + self.denied_tags, self.denied_tag_mode = (denied_tags or []), denied_tag_mode + self.tag_mode = tag_mode + self.order_by = order_by + self.period = period + self.select = select + self.kwargs = kwargs + + def _params(self, page): + params = { + 'order_by': self.order_by.value, + 'ldate': self.period.value, + 'lang': 'en', + 'page': str(page), + } + if self.tag_mode == 'and': + params['search_tag'] = '&&'.join(self.tags) + else: + params['search_tag'] = '||'.join(self.tags) + if self.denied_tags: + if self.denied_tag_mode == 'and': + params['denied_tags'] = '&&'.join(self.denied_tags) + else: + params['denied_tags'] = '||'.join(self.denied_tags) + + return {**params, **self.kwargs} + + def _get_url(self, post, resp): + id_, md5 = post['id'], post['md5'] + if self.select == 'thumbnail': + return f'https://cdn.anime-pictures.net/previews/{md5[:3]}/{md5}_bp.jpg' + elif self.select == 'preview': + return f'https://cdn.anime-pictures.net/previews/{md5[:3]}/{md5}_cp.jpg' + elif self.select == 'original': + return pq(resp.text)('#rating a.download_icon').attr('href') + else: + raise ValueError(f'Invalid image selection - {self.select!r}.') + + def 
_iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = 0 + while True: + resp = srequest(self.session, 'GET', f'{self.__root__}/api/v3/posts', params=self._params(page)) + resp.raise_for_status() + + posts = resp.json()['posts'] + if not posts: + break + + for post in posts: + resp_page = srequest(self.session, 'GET', f'{self.__root__}/posts/{post["id"]}?lang=en') + resp_page.raise_for_status() + + url = self._get_url(post, resp_page) + tags = [item.text().replace(' ', '_') for item in pq(resp_page.text)('ul.tags li > a').items()] + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{post["id"]}{ext_name}' + meta = { + 'anime_pictures': post, + 'group_id': f'{self.group_name}_{post["id"]}', + 'filename': filename, + 'tags': {key: 1.0 for key in tags} + } + yield post['id'], url, meta + + page += 1 diff --git a/waifuc/source/base.py b/waifuc/source/base.py new file mode 100644 index 0000000000000000000000000000000000000000..0f4ced0a70e1b85fa29e493a03604c905bef329a --- /dev/null +++ b/waifuc/source/base.py @@ -0,0 +1,96 @@ +import copy +from typing import Iterator, Optional + +from tqdm.auto import tqdm + +from ..action import BaseAction +from ..export import BaseExporter +from ..model import ImageItem +from ..utils import task_ctx, get_task_names + + +class BaseDataSource: + def _iter(self) -> Iterator[ImageItem]: + raise NotImplementedError # pragma: no cover + + def _iter_from(self) -> Iterator[ImageItem]: + yield from self._iter() + + def __iter__(self) -> Iterator[ImageItem]: + yield from self._iter_from() + + def __or__(self, other): + from .compose import ParallelDataSource + if isinstance(self, ParallelDataSource): + if isinstance(other, ParallelDataSource): + return ParallelDataSource(*self.sources, *other.sources) + else: + return ParallelDataSource(*self.sources, other) + else: + if isinstance(other, ParallelDataSource): + return ParallelDataSource(self, *other.sources) + else: + return 
ParallelDataSource(self, other) + + def __add__(self, other): + from .compose import ComposedDataSource + if isinstance(self, ComposedDataSource): + if isinstance(other, ComposedDataSource): + return ComposedDataSource(*self.sources, *other.sources) + else: + return ComposedDataSource(*self.sources, other) + else: + if isinstance(other, ComposedDataSource): + return ComposedDataSource(self, *other.sources) + else: + return ComposedDataSource(self, other) + + def __getitem__(self, item): + from ..action import SliceSelectAction + if isinstance(item, slice): + return self.attach(SliceSelectAction(item.start, item.stop, item.step)) + else: + raise TypeError(f'Data source\'s getitem only accept slices, but {item!r} found.') + + def attach(self, *actions: BaseAction) -> 'AttachedDataSource': + return AttachedDataSource(self, *actions) + + def export(self, exporter: BaseExporter, name: Optional[str] = None): + exporter = copy.deepcopy(exporter) + exporter.reset() + with task_ctx(name): + return exporter.export_from(iter(self)) + + +class RootDataSource(BaseDataSource): + def _iter(self) -> Iterator[ImageItem]: + raise NotImplementedError # pragma: no cover + + def _iter_from(self) -> Iterator[ImageItem]: + names = get_task_names() + if names: + desc = f'{self.__class__.__name__} - {".".join(names)}' + else: + desc = f'{self.__class__.__name__}' + for item in tqdm(self._iter(), desc=desc): + yield item + + +class AttachedDataSource(BaseDataSource): + def __init__(self, source: BaseDataSource, *actions: BaseAction): + self.source = source + self.actions = actions + + def _iter(self) -> Iterator[ImageItem]: + t = self.source + for action in self.actions: + action = copy.deepcopy(action) + action.reset() + t = action.iter_from(t) + + yield from t + + +class EmptySource(BaseDataSource): + def _iter(self) -> Iterator[ImageItem]: + yield from [] diff --git a/waifuc/source/compose.py b/waifuc/source/compose.py new file mode 100644 index 
0000000000000000000000000000000000000000..49d449fff95f2d42f0d3f8e726bdf8d6c5d97532 --- /dev/null +++ b/waifuc/source/compose.py @@ -0,0 +1,37 @@ +import random +from typing import Iterator, Optional + +from .base import BaseDataSource +from ..model import ImageItem + + +class ComposedDataSource(BaseDataSource): + def __init__(self, *sources: BaseDataSource): + self.sources = sources + + def _iter(self) -> Iterator[ImageItem]: + for source in self.sources: + yield from iter(source) + + def _iter_from(self) -> Iterator[ImageItem]: + yield from self._iter() + + +class ParallelDataSource(BaseDataSource): + def __init__(self, *sources: BaseDataSource, seed: Optional[int] = None): + self.sources = sources + self.random = random.Random(seed) + + def _iter(self) -> Iterator[ImageItem]: + iters = [iter(source) for source in self.sources] + while len(iters) > 0: + id_ = self.random.choice(range(len(iters))) + iter_ = iters[id_] + + try: + yield next(iter_) + except StopIteration: + iters.pop(id_) + + def _iter_from(self) -> Iterator[ImageItem]: + yield from self._iter() diff --git a/waifuc/source/danbooru.py b/waifuc/source/danbooru.py new file mode 100644 index 0000000000000000000000000000000000000000..90ef68dfa30fccbe55ed5d5da4f1619c9dea3380 --- /dev/null +++ b/waifuc/source/danbooru.py @@ -0,0 +1,167 @@ +import os.path +import re +from typing import Optional, Iterator, List, Tuple, Union, Literal + +from hbutils.system import urlsplit +from requests.auth import HTTPBasicAuth + +from .web import NoURL, WebDataSource +from ..config.meta import __TITLE__, __VERSION__ +from ..utils import get_requests_session, srequest + +_DanbooruSiteTyping = Literal['konachan', 'yandere', 'danbooru', 'safebooru', 'lolibooru'] + + +class DanbooruLikeSource(WebDataSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + site_name: Optional[str] = 'danbooru', site_url: 
Optional[str] = 'https://danbooru.donmai.us/', + group_name: Optional[str] = None): + WebDataSource.__init__(self, group_name or site_name, None, download_silent) + self.session = get_requests_session(headers={ + "User-Agent": f"{__TITLE__}/{__VERSION__}", + 'Content-Type': 'application/json; charset=utf-8', + }) + self.auth = HTTPBasicAuth(username, api_key) if username and api_key else None + self.site_name, self.site_url = site_name, site_url + self.tags = tags + self.min_size = min_size + + def _get_data_from_raw(self, raw): + return raw + + def _select_url(self, data): + if self.min_size is not None and "media_asset" in data and "variants" in data["media_asset"]: + variants = data["media_asset"]["variants"] + width, height, url = None, None, None + for item in variants: + if 'width' in item and 'height' in item and \ + item['width'] >= self.min_size and item['height'] >= self.min_size: + if url is None or item['width'] < width: + width, height, url = item['width'], item['height'], item['url'] + + if url is not None: + return url + + if 'file_url' not in data: + raise NoURL + + return data['file_url'] + + def _get_tags(self, data): + return re.split(r'\s+', data["tag_string"]) + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = 1 + while True: + resp = srequest(self.session, 'GET', f'{self.site_url}/posts.json', params={ + "format": "json", + "limit": "100", + "page": str(page), + "tags": ' '.join(self.tags), + }, auth=self.auth) + resp.raise_for_status() + page_items = self._get_data_from_raw(resp.json()) + if not page_items: + break + + for data in page_items: + try: + url = self._select_url(data) + except NoURL: + continue + + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{data["id"]}{ext_name}' + meta = { + self.site_name: data, + 'group_id': f'{self.group_name}_{data["id"]}', + 'filename': filename, + 'tags': {key: 1.0 for key in self._get_tags(data)} + } + yield data['id'], url, meta 
+ + page += 1 + + +class DanbooruSource(DanbooruLikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + group_name: Optional[str] = None): + DanbooruLikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + 'danbooru', 'https://danbooru.donmai.us/', group_name) + + +class SafebooruSource(DanbooruLikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + group_name: Optional[str] = None): + DanbooruLikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + 'safebooru', 'https://safebooru.donmai.us', group_name) + + +class ATFBooruSource(DanbooruLikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + group_name: Optional[str] = None): + DanbooruLikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + 'danbooru', 'https://booru.allthefallen.moe', group_name) + + +class E621LikeSource(DanbooruLikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + site_name: Optional[str] = 'e621', site_url: Optional[str] = 'https://e621.net/', + group_name: Optional[str] = None): + DanbooruLikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + site_name, site_url, group_name or site_name) + + def _get_data_from_raw(self, raw): + return raw['posts'] + + def _select_url(self, data): + urls = [] + urls.append((data['file']['url'], data['file']['width'], data['file']['height'])) + urls.append((data['preview']['url'], data['preview']['width'], data['preview']['height'])) + if 'sample' in data and 
data['sample']['has']: + urls.append((data['sample']['url'], data['sample']['width'], data['sample']['height'])) + + if self.min_size is not None: + f_url, f_width, f_height = None, None, None + for url, width, height in urls: + if width >= self.min_size and height >= self.min_size: + if f_url is None or width < f_width: + f_url, f_width, f_height = url, width, height + + if f_url is not None: + return f_url + + return urls[0][0] + + def _get_tags(self, data): + tags = [] + for value in data['tags'].values(): + tags.extend(value) + return tags + + +class E621Source(E621LikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + group_name: Optional[str] = 'e621'): + E621LikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + 'e621', 'https://e621.net/', group_name) + + +class E926Source(E621LikeSource): + def __init__(self, tags: List[str], + min_size: Optional[int] = 800, download_silent: bool = True, + username: Optional[str] = None, api_key: Optional[str] = None, + group_name: Optional[str] = 'e926'): + E621LikeSource.__init__(self, tags, min_size, download_silent, username, api_key, + 'e926', 'https://e926.net/', group_name) diff --git a/waifuc/source/derpibooru.py b/waifuc/source/derpibooru.py new file mode 100644 index 0000000000000000000000000000000000000000..bc6f0ed1d21ab2feebe1a6024e920779f75d0fac --- /dev/null +++ b/waifuc/source/derpibooru.py @@ -0,0 +1,74 @@ +import os +from typing import Literal, Optional, Iterator, Tuple, Union, List + +from hbutils.system import urlsplit + +from .web import WebDataSource +from ..utils import get_requests_session, srequest + +SelectTyping = Literal['thumb', 'small', 'medium', 'large', 'full'] + + +class DerpibooruLikeSource(WebDataSource): + def __init__(self, site_name: str, site_url: str, + tags: List[str], key: Optional[str] = None, select: SelectTyping = 'large', + 
download_silent: bool = True, group_name: Optional[str] = None): + WebDataSource.__init__(self, group_name or site_name, get_requests_session(), download_silent) + self.tags = tags + self.key = key + self.select = select + self.site_name = site_name + self.site_url = site_url + + def _params(self, page): + params = { + 'q': ' '.join(self.tags), + 'per_page': '100', + 'page': str(page), + } + if self.key: + params['key'] = self.key + + return params + + def _get_url(self, data): + if self.select in data['representations']: + return data['representations'][self.select] + else: + return data['representations']['full'] + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = 1 + while True: + resp = srequest(self.session, 'GET', f'{self.site_url}/api/v1/json/search/images', + params=self._params(page)) + resp.raise_for_status() + + posts = resp.json()['images'] + for data in posts: + url = self._get_url(data) + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{data["id"]}{ext_name}' + meta = { + self.site_name: data, + 'group_id': f'{self.group_name}_{data["id"]}', + 'filename': filename, + 'tags': {key.replace(' ', '_'): 1.0 for key in data['tags']} + } + yield data['id'], url, meta + + page += 1 + + +class DerpibooruSource(DerpibooruLikeSource): + def __init__(self, tags: List[str], key: Optional[str] = None, select: SelectTyping = 'large', + download_silent: bool = True, group_name: str = 'derpibooru'): + DerpibooruLikeSource.__init__(self, 'derpibooru', 'https://derpibooru.org', + tags, key, select, download_silent, group_name) + + +class FurbooruSource(DerpibooruLikeSource): + def __init__(self, tags: List[str], key: Optional[str] = None, select: SelectTyping = 'large', + download_silent: bool = True, group_name: str = 'furbooru'): + DerpibooruLikeSource.__init__(self, 'furbooru', 'https://furbooru.com', + tags, key, select, download_silent, group_name) diff --git a/waifuc/source/duitang.py 
b/waifuc/source/duitang.py new file mode 100644 index 0000000000000000000000000000000000000000..8cde661bed8f3c9729b5fcb4ba86ef987faaaab1 --- /dev/null +++ b/waifuc/source/duitang.py @@ -0,0 +1,63 @@ +import os +import re +from typing import Iterator, Tuple, Union + +from hbutils.system import urlsplit + +from .web import WebDataSource +from ..utils import get_requests_session, srequest + + +def _extract_words(keyword): + return list(filter(bool, re.split(r'[\W_]+', keyword))) + + +class DuitangSource(WebDataSource): + def __init__(self, keyword: str, strict: bool = True, page_size: int = 100, + group_name: str = 'duitang', download_silent: bool = True): + WebDataSource.__init__(self, group_name, get_requests_session(), download_silent) + self.keyword = keyword + self.words = set(_extract_words(keyword)) + self.page_size: int = page_size + self.strict = strict + + def _check_title(self, title): + if not self.strict: + return True + else: + t_words = set(_extract_words(title)) + return len(t_words & self.words) == len(self.words) + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + offset = 0 + while True: + resp = srequest(self.session, 'GET', 'https://www.duitang.com/napi/blog/list/by_search/', params={ + 'kw': self.keyword, + 'start': str(offset), + 'limit': str(self.page_size), + }) + resp.raise_for_status() + + raw = resp.json() + if 'data' not in raw or 'object_list' not in raw['data']: + break + + posts = raw['data']['object_list'] + if not posts: + break + + for post in posts: + if not self._check_title(post['msg']): + continue + + url = post['photo']['path'] + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{post["id"]}{ext_name}' + meta = { + 'duitang': post, + 'group_id': f'{self.group_name}_{post["id"]}', + 'filename': filename, + } + yield post['id'], url, meta + + offset += self.page_size diff --git a/waifuc/source/gchar.py b/waifuc/source/gchar.py new file mode 100644 index 
0000000000000000000000000000000000000000..adbdc57eb44c9d3ce609bd40701d9e19c9b9f4da --- /dev/null +++ b/waifuc/source/gchar.py @@ -0,0 +1,166 @@ +import logging +from functools import reduce +from operator import __or__ +from typing import Iterator, Tuple, Optional, List, Mapping + +from hbutils.string import plural_word + +from .anime_pictures import AnimePicturesSource +from .base import BaseDataSource +from .danbooru import ATFBooruSource, DanbooruSource, DanbooruLikeSource +from .konachan import KonachanSource, KonachanNetSource, HypnoHubSource, LolibooruSource, XbooruSource, YandeSource, \ + Rule34Source, KonachanLikeSource +from .pixiv import PixivSearchSource +from .sankaku import SankakuSource +from .wallhaven import WallHavenSource +from .zerochan import ZerochanSource +from ..model import ImageItem + +_PRESET_SITES = ('zerochan', 'danbooru') +_REGISTERED_SITE_SOURCES = { + 'anime_pictures': AnimePicturesSource, + 'atfbooru': ATFBooruSource, + # 'sankaku': SankakuSource, # still something wrong with sankaku source + 'danbooru': DanbooruSource, + 'hypnohub': HypnoHubSource, + 'konachan': KonachanSource, + 'konachan_net': KonachanNetSource, + 'lolibooru': LolibooruSource, + 'rule34': Rule34Source, + # 'safebooru': SafebooruSource, + 'xbooru': XbooruSource, + 'yande': YandeSource, + 'zerochan': ZerochanSource, + 'wallhaven': WallHavenSource, + 'pixiv': PixivSearchSource, +} + + +class GcharAutoSource(BaseDataSource): + def __init__(self, ch, allow_fuzzy: bool = False, fuzzy_threshold: int = 80, contains_extra: bool = True, + sure_only: bool = True, preset_sites: Tuple[str, ...] = _PRESET_SITES, + max_preset_limit: Optional[int] = None, main_sources_count: int = 3, + blacklist_sites: Tuple[str, ...] 
= (), pixiv_refresh_token: Optional[str] = None, + extra_cfg: Optional[Mapping[str, dict]] = None): + from gchar.games import get_character + from gchar.games.base import Character + + if isinstance(ch, Character): + self.ch = ch + else: + self.ch = get_character(ch, allow_fuzzy, fuzzy_threshold, contains_extra) + if not self.ch: + raise ValueError(f'Character {ch!r} not found.') + logging.info(f'Character {self.ch!r} found in gchar.') + + self.sure_only = sure_only + self.pixiv_refresh_token = pixiv_refresh_token + self.extra_cfg = dict(extra_cfg or {}) + + for site in preset_sites: + assert site in _REGISTERED_SITE_SOURCES, f'Preset site {site!r} not available.' + self.preset_sites = sorted(preset_sites) + self.max_preset_limit = max_preset_limit + if 'pixiv' in self.preset_sites and not self.pixiv_refresh_token: + raise ValueError('Pixiv refresh token not given for presetting pixiv source!') + self.main_sources_count = main_sources_count + + self.blacklist_sites = blacklist_sites + + def _select_keyword_for_site(self, site) -> Tuple[Optional[str], Optional[int]]: + from gchar.resources.sites import list_site_tags + from gchar.resources.pixiv import get_pixiv_keywords, get_pixiv_posts + + if site == 'pixiv': + keyword = get_pixiv_keywords(self.ch) + cnt = get_pixiv_posts(self.ch) + count = 0 if cnt is None else cnt[0] + return keyword, count + + else: + tags: List[Tuple[str, int]] = list_site_tags(self.ch, site, sure_only=self.sure_only, with_posts=True) + tags = sorted(tags, key=lambda x: (-x[1], x[0])) + if tags: + return tags[0] + else: + return None, None + + def _build_source_on_site(self, site) -> Optional[BaseDataSource]: + site_class = _REGISTERED_SITE_SOURCES[site] + keyword, count = self._select_keyword_for_site(site) + if keyword is not None: + extra_cfg = dict(self.extra_cfg.get(site, None) or {}) + logging.info(f'Recommended keyword for site {site!r} is {keyword!r}, ' + f'with {plural_word(count, "known post")}.') + if issubclass(site_class, 
(DanbooruLikeSource, AnimePicturesSource)): + return site_class([keyword, 'solo'], **extra_cfg) + elif issubclass(site_class, (KonachanLikeSource, SankakuSource)): + return site_class([keyword], **extra_cfg) + elif issubclass(site_class, ZerochanSource): + return ZerochanSource(keyword, strict=True, **extra_cfg) + elif issubclass(site_class, WallHavenSource): + return site_class(keyword, **extra_cfg) + elif issubclass(site_class, (PixivSearchSource,)): + return site_class(keyword, refresh_token=self.pixiv_refresh_token, **extra_cfg) + else: + raise TypeError(f'Unknown class {site_class!r} for keyword {keyword!r}.') # pragma: no cover + else: + logging.info(f'No keyword recommendation for site {site!r}.') + return None + + def _build_preset_source(self) -> Optional[BaseDataSource]: + logging.info('Building preset sites sources ...') + sources = [ + self._build_source_on_site(site) + for site in self.preset_sites + ] + sources = [source for source in sources if source is not None] + if sources: + retval = reduce(__or__, sources) + if self.max_preset_limit is not None: + retval = retval[:self.max_preset_limit] + return retval + else: + return None + + def _build_main_source(self) -> Optional[BaseDataSource]: + _all_sites = set(_REGISTERED_SITE_SOURCES.keys()) + if not self.pixiv_refresh_token: + _all_sites.remove('pixiv') + _all_sites = sorted(_all_sites - set(self.preset_sites) - set(self.blacklist_sites)) + logging.info(f'Available sites for main sources: {_all_sites!r}.') + + site_pairs = [] + for site in _all_sites: + keyword, count = self._select_keyword_for_site(site) + if keyword is not None: + site_pairs.append((site, keyword, count)) + site_pairs = sorted(site_pairs, key=lambda x: -x[2])[:self.main_sources_count] + logging.info(f'Selected main sites: {site_pairs!r}') + + sources = [ + self._build_source_on_site(site) + for site, _, _ in site_pairs + ] + sources = [source for source in sources if source is not None] + if sources: + return reduce(__or__, 
sources) + else: + return None + + def _build_source(self) -> Optional[BaseDataSource]: + preset_source = self._build_preset_source() + main_source = self._build_main_source() + if preset_source and main_source: + return preset_source + main_source + elif preset_source: + return preset_source + elif main_source: + return main_source + else: + return None + + def _iter(self) -> Iterator[ImageItem]: + source = self._build_source() + if source is not None: + yield from source._iter() diff --git a/waifuc/source/huashi6.py b/waifuc/source/huashi6.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e8f164a5c2d3f65f7ff6f89a1a8b87de195a48 --- /dev/null +++ b/waifuc/source/huashi6.py @@ -0,0 +1,50 @@ +import os +from functools import lru_cache +from typing import Iterator, Tuple, Union +from urllib.parse import quote_plus, urljoin + +from hbutils.system import urlsplit + +from .web import WebDataSource +from ..utils import get_requests_session, srequest + + +class Huashi6Source(WebDataSource): + __img_site_url__ = 'https://img2.huashi6.com' + + def __init__(self, word: str, + group_name: str = 'huashi6', download_silent: bool = True): + WebDataSource.__init__(self, group_name, get_requests_session(), download_silent) + self.word = word + + @classmethod + @lru_cache() + def _get_img_site_url(cls): + return cls.__img_site_url__ + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = 1 + + while True: + resp = srequest(self.session, 'POST', "https://rt.huashi6.com/search/all", data={ + 'word': self.word, + 'index': str(page), + }, headers={ + "referrer": f"https://www.huashi6.com/search?searchText={quote_plus(self.word)}&p={page}", + }) + raw = resp.json()['data'] + if 'works' not in raw or not raw['works']: + break + + for post in raw['works']: + url = urljoin(self._get_img_site_url(), post['coverImage']['path']) + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{post["id"]}{ext_name}' + 
meta = { + 'huashi6': post, + 'group_id': f'{self.group_name}_{post["id"]}', + 'filename': filename, + } + yield post['id'], url, meta + + page += 1 diff --git a/waifuc/source/konachan.py b/waifuc/source/konachan.py new file mode 100644 index 0000000000000000000000000000000000000000..afcc8c6f7cf6ed8924a244550427ecc405536285 --- /dev/null +++ b/waifuc/source/konachan.py @@ -0,0 +1,221 @@ +import os +import re +from typing import Iterator, Tuple, Union, List, Optional + +from hbutils.system import urlsplit + +from .web import WebDataSource, NoURL +from ..utils import get_requests_session, srequest + + +class KonachanLikeSource(WebDataSource): + def __init__(self, site_name: str, site_url: str, + tags: List[str], start_page: int = 1, min_size: Optional[int] = 800, + group_name: Optional[str] = None, download_silent: bool = True): + WebDataSource.__init__(self, group_name or site_name, get_requests_session(), download_silent) + self.site_name = site_name + self.site_url = site_url + self.start_page = start_page + self.min_size = min_size + self.tags: List[str] = tags + + def _select_url(self, data): + if self.min_size is not None: + url_names = [key for key in data.keys() if key.endswith('_url')] + name_pairs = [ + *( + (name, f'{name[:-4]}_width', f'{name[:-4]}_height') + for name in url_names + ), + ('file_url', 'width', 'height'), + ] + + f_url, f_width, f_height = None, None, None + for url_name, width_name, height_name in name_pairs: + if url_name in data and width_name in data and height_name in data: + url, width, height = data[url_name], data[width_name], data[height_name] + if width >= self.min_size and height >= self.min_size: + if f_url is None or width < f_width: + f_url, f_width, f_height = url, width, height + + if f_url is not None: + return f_url + + if 'file_url' in data: + return data['file_url'] + else: + raise NoURL + + def _request(self, page): + return srequest(self.session, 'GET', f'{self.site_url}/post.json', params={ + 'tags': ' 
'.join(self.tags), + 'limit': '100', + 'page': str(page), + }) + + def _get_data_from_raw(self, raw): + return raw + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = self.start_page + while True: + resp = self._request(page) + resp.raise_for_status() + + # response may be simply empty in rule34.xxx and xbooru.com + if not resp.text.strip(): + break + page_list = self._get_data_from_raw(resp.json()) + if not page_list: + break + + for data in page_list: + try: + url = self._select_url(data) + except NoURL: + continue + + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{data["id"]}{ext_name}' + meta = { + self.site_name: data, + 'group_id': f'{self.group_name}_{data["id"]}', + 'filename': filename, + 'tags': {key: 1.0 for key in re.split(r'\s+', data['tags'])} + } + yield data["id"], url, meta + + page += 1 + + +class YandeSource(KonachanLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'yande', download_silent: bool = True): + KonachanLikeSource.__init__(self, 'yande', 'https://yande.re', + tags, 1, min_size, group_name, download_silent) + + +class KonachanSource(KonachanLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'konachan', download_silent: bool = True): + KonachanLikeSource.__init__(self, 'konachan', 'https://konachan.com', + tags, 1, min_size, group_name, download_silent) + + +class KonachanNetSource(KonachanLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'konachan_net', download_silent: bool = True): + KonachanLikeSource.__init__(self, 'konachan_net', 'https://konachan.net', + tags, 1, min_size, group_name, download_silent) + + +class LolibooruSource(KonachanLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'lolibooru', download_silent: bool = True): + 
KonachanLikeSource.__init__(self, 'lolibooru', 'https://lolibooru.moe', + tags, 1, min_size, group_name, download_silent) + + def _request(self, page): + return srequest(self.session, 'GET', f'{self.site_url}/post/index.json', params={ + 'tags': ' '.join(self.tags), + 'limit': '100', + 'page': str(page), + }) + + +class Rule34LikeSource(KonachanLikeSource): + def __init__(self, site_name: str, site_url: str, + tags: List[str], min_size: Optional[int] = 800, + group_name: Optional[str] = None, download_silent: bool = True): + KonachanLikeSource.__init__(self, site_name, site_url, tags, 0, min_size, group_name, download_silent) + + def _request(self, page): + return srequest(self.session, 'GET', f'{self.site_url}/index.php', params={ + 'page': 'dapi', + 's': 'post', + 'q': 'index', + 'tags': ' '.join(self.tags), + 'json': '1', + 'limit': '100', + 'pid': str(page), + }) + + +class Rule34Source(Rule34LikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'rule34', download_silent: bool = True): + Rule34LikeSource.__init__(self, 'rule34', 'https://rule34.xxx', + tags, min_size, group_name, download_silent) + + +class HypnoHubSource(Rule34LikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'hypnohub', download_silent: bool = True): + Rule34LikeSource.__init__(self, 'hypnohub', 'https://hypnohub.net', + tags, min_size, group_name, download_silent) + + +class GelbooruSource(Rule34LikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'gelbooru', download_silent: bool = True): + Rule34LikeSource.__init__(self, 'gelbooru', 'https://gelbooru.com', + tags, min_size, group_name, download_silent) + + def _get_data_from_raw(self, raw): + return raw['post'] if 'post' in raw else None + + +class XbooruLikeSource(Rule34LikeSource): + def __init__(self, site_name: str, site_url: str, img_site_url: str, + tags: List[str], min_size: 
Optional[int] = 800, + group_name: Optional[str] = None, download_silent: bool = True): + Rule34LikeSource.__init__(self, site_name, site_url, tags, min_size, group_name, download_silent) + self.img_site_url = img_site_url + + def _select_url(self, data): + name, _ = os.path.splitext(data['image']) + urls = [(f'{self.img_site_url}/images/{data["directory"]}/{data["image"]}', data['width'], data['height'])] + if data['sample']: + urls.append(( + f'{self.img_site_url}/samples/{data["directory"]}/sample_{name}.jpg?{data["id"]}', + data['sample_width'], data['sample_height'], + )) + + if self.min_size is not None: + f_url, f_width, f_height = None, None, None + for url, width, height in urls: + if width >= self.min_size and height >= self.min_size: + if f_url is None or width < f_width: + f_url, f_width, f_height = url, width, height + + if f_url is not None: + return f_url + + return urls[0][0] + + +class XbooruSource(XbooruLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'xbooru', download_silent: bool = True): + XbooruLikeSource.__init__( + self, 'xbooru', 'https://xbooru.com', 'https://img.xbooru.com', + tags, min_size, group_name, download_silent, + ) + + +class SafebooruOrgSource(XbooruLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'safebooru_org', download_silent: bool = True): + XbooruLikeSource.__init__( + self, 'safebooru_org', 'https://safebooru.org', 'https://safebooru.org', + tags, min_size, group_name, download_silent, + ) + + +class TBIBSource(XbooruLikeSource): + def __init__(self, tags: List[str], min_size: Optional[int] = 800, + group_name: str = 'tbib', download_silent: bool = True): + XbooruLikeSource.__init__( + self, 'tbib', 'https://tbib.org', 'https://tbib.org', + tags, min_size, group_name, download_silent, + ) diff --git a/waifuc/source/local.py b/waifuc/source/local.py new file mode 100644 index 
0000000000000000000000000000000000000000..1a8432495f74c53264e2808e3ea02061dedd7e9b --- /dev/null +++ b/waifuc/source/local.py @@ -0,0 +1,84 @@ +import glob +import os +import pathlib +import random +import re +from typing import Iterator + +from PIL import UnidentifiedImageError +from imgutils.data import load_image + +from .base import RootDataSource +from ..model import ImageItem + + +class LocalSource(RootDataSource): + def __init__(self, directory: str, recursive: bool = True, shuffle: bool = False): + self.directory = directory + self.recursive = recursive + self.shuffle = shuffle + + def _iter_files(self): + if self.recursive: + for directory, _, files in os.walk(self.directory): + group_name = re.sub(r'[\W_]+', '_', directory).strip('_') + for file in files: + yield os.path.join(directory, file), group_name + else: + group_name = re.sub(r'[\W_]+', '_', self.directory).strip('_') + for file in os.listdir(self.directory): + yield os.path.join(self.directory, file), group_name + + def _actual_iter_files(self): + lst = list(self._iter_files()) + if self.shuffle: + random.shuffle(lst) + yield from lst + + def _iter(self) -> Iterator[ImageItem]: + for file, group_name in self._iter_files(): + try: + origin_item = ImageItem.load_from_image(file) + origin_item.image.load() + except UnidentifiedImageError: + continue + + meta = origin_item.meta or { + 'path': os.path.abspath(file), + 'group_id': group_name, + 'filename': os.path.basename(file), + } + yield ImageItem(origin_item.image, meta) + + +class LocalTISource(RootDataSource): + def __init__(self, directory: str): + self.directory = directory + + def _iter(self) -> Iterator[ImageItem]: + group_name = re.sub(r'[\W_]+', '_', self.directory).strip('_') + for f in glob.glob(os.path.join(self.directory, '*')): + if not os.path.isfile(f): + continue + + try: + image = load_image(f) + except UnidentifiedImageError: + continue + + id_ = os.path.splitext(os.path.basename(f))[0] + txt_file = os.path.join(self.directory, 
f'{id_}.txt') + if os.path.exists(txt_file): + full_text = pathlib.Path(txt_file).read_text(encoding='utf-8') + words = re.split(r'\s*,\s*', full_text) + tags = {word: 1.0 for word in words} + else: + tags = {} + + meta = { + 'path': os.path.abspath(f), + 'group_id': group_name, + 'filename': os.path.basename(f), + 'tags': tags, + } + yield ImageItem(image, meta) diff --git a/waifuc/source/paheal.py b/waifuc/source/paheal.py new file mode 100644 index 0000000000000000000000000000000000000000..d47e8e83ccf906edb9690e81ac44ccdfa7fd5527 --- /dev/null +++ b/waifuc/source/paheal.py @@ -0,0 +1,86 @@ +import os +import re +from typing import Optional, List, Iterator, Tuple, Union + +import requests +import xmltodict +from hbutils.system import urlsplit + +from .web import WebDataSource, NoURL +from ..utils import get_requests_session + + +class PahealSource(WebDataSource): + def __init__(self, tags: List[str], user_id: Optional[str] = None, api_key: Optional[str] = None, + min_size: Optional[int] = 800, download_silent: bool = True, group_name: str = 'paheal'): + WebDataSource.__init__(self, group_name, get_requests_session(), download_silent) + self.tags = tags + self.min_size = min_size + self.user_id, self.api_key = user_id, api_key + + def _params(self, page): + params = { + 'tags': ' '.join(self.tags), + 'limit': '100', + 'page': str(page), + } + if self.user_id and self.api_key: + params['user_id'] = self.user_id + params['api_key'] = self.api_key + + return params + + def _select_url(self, data): + if self.min_size is not None: + url_names = [key for key in data.keys() if key.endswith('_url')] + name_pairs = [ + *( + (name, f'{name[:-4]}_width', f'{name[:-4]}_height') + for name in url_names + ), + ('file_url', 'width', 'height'), + ] + + f_url, f_width, f_height = None, None, None + for url_name, width_name, height_name in name_pairs: + if url_name in data and width_name in data and height_name in data: + url, width, height = data[url_name], int(data[width_name]), 
int(data[height_name]) + if width >= self.min_size and height >= self.min_size: + if f_url is None or width < f_width: + f_url, f_width, f_height = url, width, height + + if f_url is not None: + return f_url + + if 'file_url' in data: + return data['file_url'] + else: + raise NoURL + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + page = 1 + while True: + resp = requests.get('https://rule34.paheal.net/api/danbooru/find_posts/index.xml', + params=self._params(page)) + resp.raise_for_status() + posts = xmltodict.parse(resp.text)['posts']['tag'] + + for data in posts: + data = {key.lstrip('@'): value for key, value in data.items()} + + try: + url = self._select_url(data) + except NoURL: + continue + + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{data["id"]}{ext_name}' + meta = { + 'paheal': data, + 'group_id': f'{self.group_name}_{data["id"]}', + 'filename': filename, + 'tags': {key: 1.0 for key in re.split(r'\s+', data['tags'])} + } + yield data["id"], url, meta + + page += 1 diff --git a/waifuc/source/pixiv.py b/waifuc/source/pixiv.py new file mode 100644 index 0000000000000000000000000000000000000000..35047ef38acc2dfc7e92571b22c456c1e06c52b5 --- /dev/null +++ b/waifuc/source/pixiv.py @@ -0,0 +1,168 @@ +import logging +import os +from typing import Iterator, Optional, Union, Tuple, Literal + +from hbutils.system import urlsplit +from pixivpy3 import AppPixivAPI + +from .web import WebDataSource +from ..utils import get_requests_session + +_FILTER = Literal["for_ios", ""] +_TYPE = Literal["illust", "manga", ""] +_RESTRICT = Literal["public", "private", ""] +_CONTENT_TYPE = Literal["illust", "manga", ""] +_MODE = Literal[ + "day", + "week", + "month", + "day_male", + "day_female", + "week_original", + "week_rookie", + "day_manga", + "day_r18", + "day_male_r18", + "day_female_r18", + "week_r18", + "week_r18g", + "", +] +_SEARCH_TARGET = Literal[ + "partial_match_for_tags", "exact_match_for_tags", 
class BasePixivSource(WebDataSource):
    """Base class for Pixiv-backed data sources.

    Wraps an ``AppPixivAPI`` client and turns Pixiv illustration records into
    ``(id, url, meta)`` download tuples. Subclasses only implement
    :meth:`_iter_illustration`, typically by delegating to :meth:`_paginate`.

    :param group_name: Prefix used for ids and filenames.
    :param select: Image size to download ('square_medium', 'medium',
        'large' or 'original').
    :param no_ai: Skip illustrations with ``illust_ai_type == 2``
        (presumably Pixiv's AI-generated flag — confirm against the API docs).
    :param refresh_token: Optional OAuth refresh token; when set, the client
        authenticates at the start of iteration.
    :param download_silent: Suppress per-file download progress bars.
    """

    def __init__(self, group_name: str = 'pixiv', select: _SELECT = 'large',
                 no_ai: bool = False, refresh_token: Optional[str] = None, download_silent: bool = True):
        self.select = select
        self.no_ai = no_ai
        self.refresh_token = refresh_token
        self.client = AppPixivAPI()
        self.client.requests = get_requests_session(session=self.client.requests)
        # Pixiv's image CDN expects an app-api referer on download requests.
        self.client.requests.headers.update({"Referer": "https://app-api.pixiv.net/"})
        WebDataSource.__init__(self, group_name, self.client.requests, download_silent)

    def _iter_illustration(self) -> Iterator[dict]:
        """Yield raw illustration dicts; implemented by subclasses."""
        raise NotImplementedError  # pragma: no cover

    def _paginate(self, fetch) -> Iterator[dict]:
        """Iterate every illustration of an offset-paged Pixiv endpoint.

        Shared by all subclasses (previously this loop was duplicated
        verbatim three times).

        :param fetch: Callable taking the current offset and returning one
            page of the API response (a dict carrying an ``illusts`` list).
        """
        offset = 0
        while True:
            data = fetch(offset)
            if 'illusts' not in data:
                # Error page (e.g. auth failure / rate limit) — stop here.
                logging.warning(f'Illusts not found in page (offset: {offset!r}), skipped: {data!r}.')
                break
            illustrations = data['illusts']
            yield from illustrations

            offset += len(illustrations)
            if not illustrations:  # empty page means we are done
                break

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(id, url, meta)`` triples for each page of each illustration."""
        if self.refresh_token:
            self.client.auth(refresh_token=self.refresh_token)

        for illust in self._iter_illustration():
            if illust['type'] != 'illust':
                continue
            if self.no_ai and illust['illust_ai_type'] == 2:
                continue

            if illust['page_count'] == 1:
                # Single-page posts keep the original URL in a separate field.
                if self.select != 'original':
                    urls = [illust['image_urls'][self.select]]
                else:
                    urls = [illust['meta_single_page']['original_image_url']]
            else:
                urls = [page['image_urls'][self.select] for page in illust['meta_pages']]

            for i, url in enumerate(urls):
                _, ext_name = os.path.splitext(urlsplit(url).filename)
                filename = f'{self.group_name}_{illust["id"]}_{i}{ext_name}'
                meta = {
                    'pixiv': illust,
                    'group_id': f'{self.group_name}_{illust["id"]}',
                    'instance_id': f'{self.group_name}_{illust["id"]}_{i}',
                    'filename': filename,
                }
                yield f'{illust["id"]}_{i}', url, meta


class PixivSearchSource(BasePixivSource):
    """Pixiv source backed by the keyword-search endpoint."""

    def __init__(self, word: str, search_target: _SEARCH_TARGET = "partial_match_for_tags",
                 sort: _SORT = "date_desc", duration: _DURATION = None, start_date: Optional[str] = None,
                 end_date: Optional[str] = None, filter: _FILTER = "for_ios", req_auth: bool = True,
                 group_name: str = 'pixiv', select: _SELECT = 'large',
                 no_ai: bool = False, refresh_token: Optional[str] = None, download_silent: bool = True):
        BasePixivSource.__init__(self, group_name, select, no_ai, refresh_token, download_silent)
        self.word = word
        self.search_target = search_target
        self.sort = sort
        self.duration = duration
        self.start_date = start_date
        self.end_date = end_date
        self.filter = filter
        self.req_auth = req_auth

    def _iter_illustration(self) -> Iterator[dict]:
        return self._paginate(lambda offset: self.client.search_illust(
            self.word, self.search_target, self.sort, self.duration,
            self.start_date, self.end_date, self.filter, offset, self.req_auth))


class PixivUserSource(BasePixivSource):
    """Pixiv source that lists the works of a single user."""

    def __init__(self, user_id: Union[int, str], type: _TYPE = "illust",
                 filter: _FILTER = "for_ios", req_auth: bool = True,
                 group_name: str = 'pixiv', select: _SELECT = 'large',
                 no_ai: bool = False, refresh_token: Optional[str] = None, download_silent: bool = True):
        BasePixivSource.__init__(self, group_name, select, no_ai, refresh_token, download_silent)
        self.user_id = user_id
        self.type = type
        self.filter = filter
        self.req_auth = req_auth

    def _iter_illustration(self) -> Iterator[dict]:
        return self._paginate(lambda offset: self.client.user_illusts(
            self.user_id, self.type, self.filter, offset, self.req_auth))


class PixivRankingSource(BasePixivSource):
    """Pixiv source backed by the ranking (leaderboard) endpoint."""

    def __init__(self, mode: _MODE = "day", filter: _FILTER = "for_ios",
                 date: Optional[str] = None, req_auth: bool = True,
                 group_name: str = 'pixiv', select: _SELECT = 'large',
                 no_ai: bool = False, refresh_token: Optional[str] = None, download_silent: bool = True):
        BasePixivSource.__init__(self, group_name, select, no_ai, refresh_token, download_silent)
        self.mode = mode
        self.filter = filter
        self.date = date
        self.req_auth = req_auth

    def _iter_illustration(self) -> Iterator[dict]:
        return self._paginate(lambda offset: self.client.illust_ranking(
            self.mode, self.filter, self.date, offset, self.req_auth))
pass + elif k in {"order", "rating", "file_type"} and v is not FileType.IMAGE: # noqa + tags.append(f"{k}:{v.value}") + elif k in {"threshold", "recommended_for", "voted"}: + tags.append(f"{k}:{v}") + elif k == "date": + date = "..".join(d.strftime("%Y-%m-%dT%H:%M") for d in self.date) # type: ignore[union-attr] + tags.append(f"date:{date}") + elif k == "added_by": + for user in self.added_by: # type: ignore[union-attr] + tags.append(f"user:{user}") + + return tags + + +class SankakuSource(WebDataSource): + def __init__(self, tags: List[str], order: Optional[PostOrder] = None, + rating: Optional[Rating] = None, file_type: Optional[FileType] = None, + date: Optional[Tuple[datetime.datetime, datetime.datetime]] = None, + username: Optional[str] = None, password: Optional[str] = None, access_token: Optional[str] = None, + min_size: Optional[int] = 800, download_silent: bool = True, group_name: str = 'sankaku', **kwargs): + WebDataSource.__init__(self, group_name, get_requests_session(), download_silent) + self.tags = tags + _tags_by_kwargs(order=order, rating=rating, file_type=file_type, date=date, **kwargs) + self.username, self.password = username, password + self.access_token = access_token + + self.min_size = min_size + self.auth_session = get_requests_session(headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Host': 'capi-v2.sankakucomplex.com', + 'X-Requested-With': 'com.android.browser', + }) + + _FILE_URLS = [ + ('sample_url', 'sample_width', 'sample_height'), + ('preview_url', 'preview_width', 'preview_height'), + ('file_url', 'width', 'height'), + ] + + def _select_url(self, data): + if self.min_size is not None: + f_url, f_width, f_height = None, None, None + for url_name, width_name, height_name in self._FILE_URLS: + if url_name in data and width_name in data and height_name in data: + url, width, height = data[url_name], data[width_name], data[height_name] + if width and height and width >= 
    def _login(self):
        """Attach authentication to ``self.auth_session``.

        Prefers a pre-issued bearer ``access_token``; otherwise posts the
        username/password pair to the sankaku login endpoint and installs the
        returned ``token_type``/``access_token`` as the Authorization header.
        No-op when neither credential form was provided.
        """
        if self.access_token:
            self.auth_session.headers.update({
                "Authorization": f"Bearer {self.access_token}",
            })
        elif self.username and self.password:
            resp = srequest(self.auth_session, 'POST', 'https://login.sankakucomplex.com/auth/token',
                            json={"login": self.username, "password": self.password})
            resp.raise_for_status()
            login_data = resp.json()
            self.auth_session.headers.update({
                "Authorization": f"{login_data['token_type']} {login_data['access_token']}",
            })

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(id, url, meta)`` triples for image posts matching ``self.tags``.

        Pages through the capi-v2 posts endpoint (100 posts per page) until an
        empty page is returned. Non-image posts (``file_type`` without the
        'image' substring, i.e. gif/video) and posts with no usable URL are
        skipped. Tags are flattened into a ``{name: 1.0}`` confidence map.
        """
        self._login()

        page = 1
        while True:
            resp = srequest(self.auth_session, 'GET', 'https://capi-v2.sankakucomplex.com/posts', params={
                'lang': 'en',
                'page': str(page),
                'limit': '100',
                'tags': ' '.join(self.tags),
            })
            resp.raise_for_status()
            # Empty response body (JSON empty list) marks the last page.
            if not resp.json():
                break

            for data in resp.json():
                # file_type is a MIME-ish string; substring test keeps e.g. 'image/png'.
                if 'file_type' not in data or 'image' not in data['file_type']:
                    continue

                try:
                    url = self._select_url(data)
                except NoURL:
                    continue

                _, ext_name = os.path.splitext(urlsplit(url).filename)
                filename = f'{self.group_name}_{data["id"]}{ext_name}'
                meta = {
                    'sankaku': data,
                    'group_id': f'{self.group_name}_{data["id"]}',
                    'filename': filename,
                    'tags': {key: 1.0 for key in [t_item['name'] for t_item in data['tags']]}
                }
                yield data["id"], url, meta

            page += 1
class VideoSource(BaseDataSource):
    """Data source that decodes the keyframes of a video file into images.

    Requires the optional ``pyav`` dependency (imported as ``av`` at module
    level; ``av`` is ``None`` when the import failed).
    """

    def __init__(self, video_file):
        # Fail fast when pyav is missing rather than erroring mid-iteration.
        if av is None:
            raise ImportError(f'pyav not installed, {self.__class__.__name__} is unavailable. '
                              f'Please install this with `pip install git+https://github.com/deepghs/waifuc.git@main#egg=waifuc[video]` to solve this problem.')
        self.video_file = video_file

    def _iter(self) -> Iterator[ImageItem]:
        """Yield one ImageItem per decoded keyframe, with time/index metadata."""
        try:
            # av.datasets.curated resolves the path; it may fetch remote
            # sample assets, hence the HTTPError handling below.
            content = av.datasets.curated(self.video_file)
        except HTTPError:
            logging.error(f'Video {self.video_file!r} is invalid, skipped')
            return

        try:
            with av.open(content) as container:
                stream = container.streams.video[0]
                # Decode keyframes only — non-key frames are skipped entirely.
                stream.codec_context.skip_frame = "NONKEY"

                for i, frame in enumerate(tqdm(
                        container.decode(stream),
                        desc=f'Video Extracting - {os.path.basename(self.video_file)}')):
                    meta = {
                        'video_file': self.video_file,
                        'time': frame.time,  # presentation time in seconds, from pyav
                        'index': i,
                    }
                    yield ImageItem(frame.to_image(), meta)
        except (InvalidDataError, av.error.ValueError, IndexError) as err:
            # Corrupt containers, decode errors, or files with no video stream.
            logging.warning(f'Video extraction skipped due to error - {err!r}')

    @classmethod
    def from_directory(cls, directory: str, recursive: bool = True) -> BaseDataSource:
        """Concatenate a VideoSource for every readable file under *directory*.

        :param directory: Root directory to scan.
        :param recursive: Also descend into subdirectories.
        :return: The sum of per-file sources (EmptySource when none found).
        """
        if recursive:
            files = glob.glob(os.path.join(glob.escape(directory), '**', '*'), recursive=True)
        else:
            files = glob.glob(os.path.join(glob.escape(directory), '*'))

        source = EmptySource()
        for file in files:
            if os.path.isfile(file) and os.access(file, os.R_OK):
                source = source + cls(file)
        return source
class Category(IntFlag):
    """Wallhaven wallpaper categories, combinable as bit flags."""
    GENERAL = 0x4
    ANIME = 0x2
    PEOPLE = 0x1

    DEFAULT = GENERAL | ANIME
    ALL = GENERAL | ANIME | PEOPLE

    @property
    def mark(self) -> str:
        """Encode the flags as wallhaven's 3-digit bitmask string (e.g. '110')."""
        return f'{"1" if self & self.GENERAL else "0"}' \
               f'{"1" if self & self.ANIME else "0"}' \
               f'{"1" if self & self.PEOPLE else "0"}'


class Purity(IntFlag):
    """Wallhaven purity (content-rating) levels, combinable as bit flags."""
    SFW = 0x4
    SKETCHY = 0x2
    NSFW = 0x1

    DEFAULT = SFW | SKETCHY
    ALL = SFW | SKETCHY | NSFW

    @property
    def mark(self) -> str:
        """Encode the flags as wallhaven's 3-digit bitmask string (e.g. '110')."""
        return f'{"1" if self & self.SFW else "0"}' \
               f'{"1" if self & self.SKETCHY else "0"}' \
               f'{"1" if self & self.NSFW else "0"}'


SortingTyping = Literal['date_added', 'relevance', 'random', 'views', 'favorites', 'toplist']
SelectTyping = Literal['original', 'thumbnail']


class WallHavenSource(WebDataSource):
    """Data source for wallhaven.cc search results.

    :param query: Search query string (`q` API parameter).
    :param category: Categories to include in results.
    :param purity: Purity levels to include.
    :param sorting: Result ordering. BUGFIX: the default was the misspelled
        'relavance', which is not even a member of ``SortingTyping`` and was
        sent verbatim to the API; corrected to 'relevance'.
    :param no_ai: Enable wallhaven's AI-art filter.
    :param min_size: Minimum (width, height) passed as the `atleast` parameter.
    :param select: Download the 'original' image or its 'thumbnail'.
    :param api_key: Optional wallhaven API key, sent as the X-API-Key header.
    :param group_name: Prefix used for ids and filenames.
    :param download_silent: Suppress per-file download progress bars.
    """

    def __init__(self, query: str, category: Category = Category.DEFAULT,
                 purity: Purity = Purity.DEFAULT, sorting: SortingTyping = 'relevance',
                 no_ai: bool = True, min_size: Tuple[int, int] = (1, 1),
                 select: SelectTyping = 'original', api_key: Optional[str] = None,
                 group_name: str = 'wallhaven', download_silent: bool = True):
        # cloudscraper transparently solves Cloudflare challenges on wallhaven.cc.
        session = get_requests_session(session=cloudscraper.create_scraper())
        if api_key:
            session.headers.update({'X-API-Key': api_key})
        WebDataSource.__init__(self, group_name, session, download_silent)

        self.query = query
        self.category = category
        self.purity = purity
        self.sorting = sorting
        self.no_ai = no_ai
        self.min_size = min_size
        self.select = select

    def _select_url(self, data):
        """Pick the download URL of one result according to ``self.select``."""
        if self.select == 'original':
            return data['path']
        elif self.select == 'thumbnail':
            return data['thumbs']['original']
        else:
            raise ValueError(f'Unknown image selection - {self.select!r}.')

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(id, url, meta)`` triples, paging until an empty result set."""
        page = 1
        while True:
            resp = srequest(self.session, 'GET', 'https://wallhaven.cc/api/v1/search', params={
                'q': self.query,
                'categories': self.category.mark,
                'purity': self.purity.mark,
                'sorting': self.sorting,
                'ai_art_filter': "1" if self.no_ai else "0",
                'atleast': f'{self.min_size[0]}x{self.min_size[1]}',
                'page': str(page),
            })
            raw = resp.json()
            # .get() tolerates a malformed response without a 'data' key.
            if not raw or not raw.get('data'):
                break

            for data in raw['data']:
                url = self._select_url(data)

                _, ext_name = os.path.splitext(urlsplit(url).filename)
                filename = f'{self.group_name}_{data["id"]}{ext_name}'
                meta = {
                    'wallhaven': data,
                    'group_id': f'{self.group_name}_{data["id"]}',
                    'filename': filename,
                }
                yield data['id'], url, meta

            page += 1
class Sort(str, Enum):
    # Result ordering; value is sent as the `s` query parameter (see `_params`).
    ID = 'id'
    FAV = 'fav'


class Time(str, Enum):
    # Time window; value is sent as the `t` query parameter (see `_params`).
    ALL = '0'
    LAST_7000 = '1'
    LAST_15000 = '2'


class Dimension(str, Enum):
    # Size/aspect filter; value is sent as the `d` query parameter (see `_params`).
    LARGE = 'large'
    HUGE = 'huge'
    LANDSCAPE = 'landscape'
    PORTRAIT = 'portrait'
    SQUARE = 'square'


# Which image variant to download: the API-provided thumbnail ('medium') or
# the guessed 600px ('large') / full-resolution ('full') CDN URLs built in
# `_get_urls`.
SelectTyping = Literal['medium', 'large', 'full']
Optional[str] = None): + if user_agent: + headers = {'User-Agent': user_agent} + else: + headers = {} + WebDataSource.__init__(self, group_name, get_requests_session(headers=headers), download_silent) + self.word = word + self.sort = sort + self.time = time + self.dimension = dimension + self.color = color + self.strict = strict + self.select = select + + self.username = username + self._password = password + self._is_authed = False + + def _auth(self): + if not self._is_authed and self.username is not None: + resp = self.session.post( + 'https://www.zerochan.net/login', + data={ + 'ref': '/', + 'name': self.username, + 'password': self._password, + 'login': 'Login' + }, + headers={ + 'Referrer': "https://www.zerochan.net/login?ref=%2F", + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,' + 'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Encoding': 'gzip, deflate, br', + 'Content-Type': 'application/x-www-form-urlencoded', + }, + allow_redirects=False, + ) + if resp.status_code != 303: + raise ConnectionError('Username or password wrong, failed to login to zerochan.net.') + + self._is_authed = True + + @property + def _base_url(self) -> str: + if isinstance(self.word, str): + return f'{self.__SITE__}/{quote_plus(self.word)}' + elif isinstance(self.word, (list, tuple)): + return f'{self.__SITE__}/{",".join(map(quote_plus, self.word))}' + else: + raise TypeError(f'Unknown type of word - {self.word!r}.') + + @property + def _params(self) -> Mapping[str, str]: + params = { + 'json': '1', + 's': self.sort.value, + 't': self.time.value, + } + if self.dimension is not None: + params['d'] = self.dimension.value + if self.color is not None: + params['c'] = self.color + if self.strict: + params['strict'] = '1' + + return params + + @classmethod + def _get_urls(cls, data): + id_ = data['id'] + medium_url = data['thumbnail'] + prefix = quote_plus(data['tag'].replace(' ', '.')) + large_urls = 
[f'https://s1.zerochan.net/{prefix}.600.{id_}.jpg'] + full_urls = [ + f"https://static.zerochan.net/{prefix}.full.{id_}{ext}" + for ext in ['.jpg', '.png'] + ] + + return {'medium': medium_url, 'large': large_urls, 'full': full_urls} + + def _get_url(self, data): + urls = self._get_urls(data) + if self.select == 'full': + url_fallbacks = [*urls['full'], *urls['large']] + elif self.select == 'large': + url_fallbacks = urls['large'] + else: + url_fallbacks = [] + + for url in url_fallbacks: + resp = srequest(self.session, 'HEAD', url, raise_for_status=False) + if resp.ok: + return url + else: + return urls['medium'] + + def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]: + self._auth() + page = 1 + while True: + quit_ = False + _base_url = self._base_url + while True: + resp = srequest(self.session, 'GET', _base_url, + params={**self._params, 'p': str(page), 'l': '200'}, + allow_redirects=False, raise_for_status=False) + if resp.status_code // 100 == 3: + _base_url = urljoin(_base_url, resp.headers['Location']) + elif resp.status_code in {403, 404}: + quit_ = True + break + else: + resp.raise_for_status() + break + + if quit_: + break + + json_ = resp.json() + if 'items' in json_: + items = json_['items'] + for data in items: + url = self._get_url(data) + _, ext_name = os.path.splitext(urlsplit(url).filename) + filename = f'{self.group_name}_{data["id"]}{ext_name}' + meta = { + 'zerochan': { + **data, + 'url': url, + }, + 'group_id': f'{self.group_name}_{data["id"]}', + 'filename': filename, + } + yield data["id"], url, meta + else: + break + + page += 1 diff --git a/waifuc/utils/__init__.py b/waifuc/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..83eb0e64aed0dea55199f52ad6a323751e1f8577 --- /dev/null +++ b/waifuc/utils/__init__.py @@ -0,0 +1,3 @@ +from .context import task_ctx, get_task_names +from .download import download_file +from .session import get_requests_session, srequest, TimeoutHTTPAdapter diff --git 
# Context variable key under which the current task-name stack is stored.
WAIFUC_TASK_NAME = 'waifuc_task_name'


@contextmanager
def task_ctx(name: Optional[str]):
    """Push *name* onto the task-name stack for the duration of the block.

    Names accumulate across nested ``task_ctx`` calls; a falsy *name* leaves
    the stack untouched. Uses hbutils' ``context()`` variables, so the stack
    is restored automatically when the block exits.

    :param name: Task name to append, or None/'' to do nothing.
    """
    ctx = context()
    names = tuple(ctx.get(WAIFUC_TASK_NAME, None) or ())
    if name:
        with ctx.vars(**{WAIFUC_TASK_NAME: (*names, name)}):
            yield
    else:
        yield


def get_task_names() -> Tuple[str, ...]:
    """Return the tuple of task names pushed by the enclosing ``task_ctx`` calls.

    Empty tuple when no task context is active.
    """
    ctx = context()
    names = tuple(ctx.get(WAIFUC_TASK_NAME, None) or ())
    return names
class _FakeClass:
    # Stand-in for a tqdm bar in silent mode; swallows update() calls.
    def update(self, *args, **kwargs):
        pass


@contextmanager
def _with_tqdm(expected_size, desc, silent: bool = False):
    """Yield a byte-scaled tqdm bar, or a no-op stub when *silent* is set.

    :param expected_size: Total byte count for the bar (may be None).
    :param desc: Progress-bar label.
    :param silent: When True, yield a dummy object instead of a real bar.
    """
    if not silent:
        with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024, desc=desc) as pbar:
            yield pbar
    else:
        yield _FakeClass()


def download_file(url, filename, expected_size: int = None, desc=None, session=None, silent: bool = False, **kwargs):
    """Stream *url* into *filename*, optionally showing download progress.

    Creates the target directory if needed and verifies the final file size
    when a size is known, deleting the partial file on mismatch.

    :param url: URL to download (redirects are followed).
    :param filename: Destination path on disk.
    :param expected_size: Expected byte size; falls back to Content-Length.
    :param desc: Progress-bar label; defaults to the file's base name.
    :param session: Optional requests session; a default one is created if omitted.
    :param silent: Suppress the progress bar.
    :param kwargs: Extra arguments forwarded to ``srequest``.
    :return: *filename*, for chaining.
    :raises requests.exceptions.HTTPError: If the downloaded size differs
        from the expected size.
    """
    session = session or get_requests_session()
    response = srequest(session, 'GET', url, stream=True, allow_redirects=True, **kwargs)
    # Prefer the caller-supplied size; otherwise use the Content-Length header.
    expected_size = expected_size or response.headers.get('Content-Length', None)
    expected_size = int(expected_size) if expected_size is not None else expected_size

    desc = desc or os.path.basename(filename)
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'wb') as f:
        with _with_tqdm(expected_size, desc, silent) as pbar:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
                pbar.update(len(chunk))

    actual_size = os.path.getsize(filename)
    # NOTE(review): Content-Length may not equal the decoded size when the
    # server compresses the stream — confirm against real endpoints if
    # spurious size-mismatch failures appear.
    if expected_size is not None and actual_size != expected_size:
        os.remove(filename)  # drop the partial/corrupt file before raising
        raise requests.exceptions.HTTPError(f"Downloaded file is not of expected size, "
                                            f"{expected_size} expected but {actual_size} found.")

    return filename
DEFAULT_TIMEOUT = 10  # seconds


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request.

    Accepts an extra ``timeout`` keyword (stripped before it reaches
    ``HTTPAdapter``); requests made without an explicit timeout use it.
    """

    def __init__(self, *args, **kwargs):
        # pop() both reads and removes the extra kwarg in one step.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Only fill in the timeout when the caller did not pass one explicitly.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


def get_requests_session(max_retries: int = 5, timeout: int = DEFAULT_TIMEOUT,
                         headers: Optional[Dict[str, str]] = None, session: Optional[requests.Session] = None) \
        -> requests.Session:
    """Create (or decorate) a requests session with retry, timeout and UA defaults.

    :param max_retries: Total urllib3 retries for transient HTTP status codes.
    :param timeout: Default per-request timeout, in seconds.
    :param headers: Extra headers merged over the default User-Agent.
    :param session: Existing session to configure; a new one is made if omitted.
    :return: The configured session.
    """
    session = session or requests.session()
    retries = Retry(
        total=max_retries, backoff_factor=1,
        status_forcelist=[413, 429, 500, 501, 502, 503, 504, 505, 506, 507, 509, 510, 511],
        allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"],
    )
    adapter = TimeoutHTTPAdapter(max_retries=retries, timeout=timeout)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        **dict(headers or {}),
    })

    return session


def srequest(session: requests.Session, method, url, *, max_retries: int = 5,
             sleep_time: float = 5.0, raise_for_status: bool = True, **kwargs) -> requests.Response:
    """Issue a request, retrying transport-level failures with a fixed delay.

    :param session: Session to issue the request on.
    :param method: HTTP method name.
    :param url: Target URL.
    :param max_retries: Attempts before giving up on RequestException.
    :param sleep_time: Seconds to sleep between failed attempts.
    :param raise_for_status: Raise for non-2xx responses when True.
    :param kwargs: Extra arguments forwarded to ``session.request``.
    :return: The (possibly status-checked) response.
    :raises requests.exceptions.RequestException: The last transport error,
        when every attempt failed. (BUGFIX: the original used ``assert`` for
        this, which is stripped under ``python -O`` and hid the real cause.)
    """
    resp, last_error = None, None
    for attempt in range(max_retries):
        try:
            resp = session.request(method, url, **kwargs)
        except RequestException as err:
            last_error = err
            # Don't sleep after the final attempt — there is nothing left to retry.
            if attempt + 1 < max_retries:
                time.sleep(sleep_time)
        else:
            break

    if resp is None:
        if last_error is not None:
            raise last_error
        raise RuntimeError(f'Request failed for {max_retries} time(s) - [{method}] {url!r}.')
    if raise_for_status:
        resp.raise_for_status()

    return resp