"""Image-centric preprocessing ops. |
|
|
|
All preprocessing ops should return a data processing functors. A data |
|
is represented as a dictionary of (TF) tensors. The functors output a modified |
|
dictionary. |
|
|
|
The key named "image" is commonly used for the image, and is a 3D tensor of |
|
shape (height x width x channels). |
|
""" |
|
|
|
from big_vision.pp import utils |
|
from big_vision.pp.registry import Registry |
|
|
|
import tensorflow as tf |
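
# Ops in this file are typically not called directly: they are looked up by
# name in the registry when a preprocessing pipeline is built. A minimal,
# illustrative sketch of such a pipeline spec (the exact string below is an
# example, not a prescribed config):
#   "decode|resize_small(256)|central_crop(224)|flip_lr|vgg_value_range"
# Each op returns a functor mapping a data dict to a modified data dict, and
# InKeyOutKey routes it onto the "image" key by default.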


@Registry.register("preprocess_ops.decode") |
|
@utils.InKeyOutKey() |
|
def get_decode(channels=3, precise=False): |
|
"""Decode an encoded image string, see tf.io.decode_image. |
|
|
|
Args: |
|
channels: see tf.io.decode_image. |
|
precise: if False, use default TF image decoding algorithm. |
|
If True, change DCT method for JPEG decoding to match PIL/cv2/PyTorch. |
|
See also (internal link) for a concrete example. |
|
|
|
Returns: |
|
The decoded image. |
|
""" |
|
|
|
def _decode(image): |
|
if precise: |
|
return tf.image.decode_jpeg( |
|
image, channels=channels, dct_method="INTEGER_ACCURATE") |
|
else: |
|
return tf.io.decode_image( |
|
image, channels=channels, expand_animations=False) |
|
|
|
return _decode |
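
# Usage sketch (hypothetical eager-mode call; `path` is a stand-in name):
#   jpeg_bytes = tf.io.read_file(path)
#   data = get_decode(precise=True)({"image": jpeg_bytes})
#   data["image"]  # -> uint8 tensor of shape (H, W, channels)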


@Registry.register("preprocess_ops.resize") |
|
@utils.InKeyOutKey() |
|
def get_resize(size, method="bilinear", antialias=False): |
|
"""Resizes image to a given size. |
|
|
|
Args: |
|
size: either an integer H, where H is both the new height and width |
|
of the resized image, or a list or tuple [H, W] of integers, where H and W |
|
are new image"s height and width respectively. |
|
method: resize method, see tf.image.resize docs for options. |
|
antialias: see tf.image.resize. Ideally set to True for all new configs. |
|
|
|
Returns: |
|
A function for resizing an image. |
|
|
|
""" |
|
size = utils.maybe_repeat(size, 2) |
|
|
|
def _resize(image): |
|
"""Resizes image to a given size.""" |
|
|
|
|
|
|
|
|
|
|
|
dtype = image.dtype |
|
tf_dtype = tf.type_spec_from_value(image).dtype |
|
image = tf.image.resize(image, size, method=method, antialias=antialias) |
|
return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype) |
|
|
|
return _resize |
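
# Usage sketch: the functor preserves the input dtype, e.g.
#   fn = get_resize([224, 224], method="bicubic", antialias=True)
#   out = fn({"image": tf.zeros([480, 640, 3], tf.uint8)})
#   # out["image"]: shape (224, 224, 3), dtype uint8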


def _resize_factor(image, factor, method="area", antialias=True):
  """Resizes the image by a (float) `factor`, keeping the aspect ratio fixed."""
  h, w = tf.shape(image)[0], tf.shape(image)[1]

  h = tf.cast(tf.round(tf.cast(h, tf.float32) * factor), tf.int32)
  w = tf.cast(tf.round(tf.cast(w, tf.float32) * factor), tf.int32)

  dtype = image.dtype
  tf_dtype = tf.type_spec_from_value(image).dtype
  image = tf.image.resize(image, (h, w), method=method, antialias=antialias)
  return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)


@Registry.register("preprocess_ops.resize_small") |
|
@utils.InKeyOutKey() |
|
def get_resize_small(smaller_size, method="area", antialias=False): |
|
"""Resizes the smaller side to `smaller_size` keeping aspect ratio. |
|
|
|
Args: |
|
smaller_size: an integer, that represents a new size of the smaller side of |
|
an input image. |
|
method: the resize method. `area` is a meaningful, bwd-compat default. |
|
antialias: see tf.image.resize. Ideally set to True for all new configs. |
|
|
|
Returns: |
|
A function, that resizes an image and preserves its aspect ratio. |
|
|
|
Note: |
|
backwards-compat for "area"+antialias tested here: |
|
(internal link) |
|
""" |
|
|
|
def _resize_small(image): |
|
h, w = tf.shape(image)[0], tf.shape(image)[1] |
|
factor = ( |
|
tf.cast(smaller_size, tf.float32) / |
|
tf.cast(tf.minimum(h, w), tf.float32)) |
|
return _resize_factor(image, factor, method=method, antialias=antialias) |
|
return _resize_small |
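
# A common evaluation recipe (illustrative, not a prescribed config) pairs
# this op with a central crop, e.g. the pp-string fragment:
#   "resize_small(256)|central_crop(224)"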


@Registry.register("preprocess_ops.resize_long") |
|
@utils.InKeyOutKey() |
|
def get_resize_long(longer_size, method="area", antialias=True): |
|
"""Resizes the longer side to `longer_size` keeping aspect ratio. |
|
|
|
Args: |
|
longer_size: an integer, that represents a new size of the longer side of |
|
an input image. |
|
method: the resize method. `area` is a meaningful, bwd-compat default. |
|
antialias: see tf.image.resize. Ideally set to True for all new configs. |
|
|
|
Returns: |
|
A function, that resizes an image and preserves its aspect ratio. |
|
""" |
|
|
|
def _resize_long(image): |
|
h, w = tf.shape(image)[0], tf.shape(image)[1] |
|
factor = ( |
|
tf.cast(longer_size, tf.float32) / |
|
tf.cast(tf.maximum(h, w), tf.float32)) |
|
return _resize_factor(image, factor, method=method, antialias=antialias) |
|
return _resize_long |
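
# Note: resize_long bounds *both* sides of the image by `longer_size`, which
# makes it a natural choice when the result must later fit a fixed square
# canvas (e.g. via padding).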


@Registry.register("preprocess_ops.inception_crop") |
|
@utils.InKeyOutKey() |
|
def get_inception_crop(size=None, area_min=5, area_max=100, |
|
method="bilinear", antialias=False): |
|
"""Makes inception-style image crop. |
|
|
|
Inception-style crop is a random image crop (its size and aspect ratio are |
|
random) that was used for training Inception models, see |
|
https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf. |
|
|
|
Args: |
|
size: Resize image to [size, size] after crop. |
|
area_min: minimal crop area. |
|
area_max: maximal crop area. |
|
method: rezied method, see tf.image.resize docs for options. |
|
antialias: see tf.image.resize. Ideally set to True for all new configs. |
|
|
|
Returns: |
|
A function, that applies inception crop. |
|
""" |
|
|
|
def _inception_crop(image): |
|
begin, crop_size, _ = tf.image.sample_distorted_bounding_box( |
|
tf.shape(image), |
|
tf.zeros([0, 0, 4], tf.float32), |
|
area_range=(area_min / 100, area_max / 100), |
|
min_object_covered=0, |
|
use_image_if_no_bounding_boxes=True) |
|
crop = tf.slice(image, begin, crop_size) |
|
|
|
|
|
crop.set_shape([None, None, image.shape[-1]]) |
|
if size: |
|
crop = get_resize(size, method, antialias)({"image": crop})["image"] |
|
return crop |
|
|
|
return _inception_crop |
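
# Usage sketch: areas are given in percent, so the classic setting
#   fn = get_inception_crop(size=224, area_min=5, area_max=100)
# samples a crop covering 5%-100% of the image area, then resizes it to
# 224x224.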


@Registry.register("preprocess_ops.decode_jpeg_and_inception_crop") |
|
@utils.InKeyOutKey() |
|
def get_decode_jpeg_and_inception_crop(size=None, area_min=5, area_max=100, |
|
ratio_min=0.75, ratio_max=1.33, |
|
method="bilinear", antialias=False): |
|
"""Decode jpeg string and make inception-style image crop. |
|
|
|
Inception-style crop is a random image crop (its size and aspect ratio are |
|
random) that was used for training Inception models, see |
|
https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf. |
|
|
|
Args: |
|
size: Resize image to [size, size] after crop. |
|
area_min: minimal crop area. |
|
area_max: maximal crop area. |
|
ratio_min: minimal aspect ratio. |
|
ratio_max: maximal aspect ratio. |
|
method: rezied method, see tf.image.resize docs for options. |
|
antialias: see tf.image.resize. Ideally set to True for all new configs. |
|
|
|
Returns: |
|
A function, that applies inception crop. |
|
""" |
|
|
|
def _inception_crop(image_data): |
|
shape = tf.image.extract_jpeg_shape(image_data) |
|
begin, crop_size, _ = tf.image.sample_distorted_bounding_box( |
|
shape, |
|
tf.zeros([0, 0, 4], tf.float32), |
|
area_range=(area_min / 100, area_max / 100), |
|
aspect_ratio_range=(ratio_min, ratio_max), |
|
min_object_covered=0, |
|
use_image_if_no_bounding_boxes=True) |
|
|
|
|
|
offset_y, offset_x, _ = tf.unstack(begin) |
|
target_height, target_width, _ = tf.unstack(crop_size) |
|
crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) |
|
image = tf.image.decode_and_crop_jpeg(image_data, crop_window, channels=3) |
|
|
|
if size: |
|
image = get_resize(size, method, antialias)({"image": image})["image"] |
|
|
|
return image |
|
|
|
return _inception_crop |
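
# Compared to running decode followed by inception_crop, this fused op decodes
# only the sampled crop window (tf.image.decode_and_crop_jpeg), which can
# substantially reduce input-pipeline cost on large JPEGs. Sketch
# (`jpeg_bytes` is a raw encoded string, as in the decode example above):
#   data = get_decode_jpeg_and_inception_crop(size=224)({"image": jpeg_bytes})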


@Registry.register("preprocess_ops.random_crop") |
|
@utils.InKeyOutKey() |
|
def get_random_crop(crop_size): |
|
"""Makes a random crop of a given size. |
|
|
|
Args: |
|
crop_size: either an integer H, where H is both the height and width of the |
|
random crop, or a list or tuple [H, W] of integers, where H and W are |
|
height and width of the random crop respectively. |
|
|
|
Returns: |
|
A function, that applies random crop. |
|
""" |
|
crop_size = utils.maybe_repeat(crop_size, 2) |
|
|
|
def _crop(image): |
|
return tf.image.random_crop(image, (*crop_size, image.shape[-1])) |
|
|
|
return _crop |
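
# Sketch of a typical train-time augmentation chain built from ops in this
# file (illustrative spec string; the sizes are arbitrary):
#   "decode|resize_small(256)|random_crop(224)|flip_lr"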


@Registry.register("preprocess_ops.central_crop") |
|
@utils.InKeyOutKey() |
|
def get_central_crop(crop_size=None): |
|
"""Makes central crop of a given size. |
|
|
|
Args: |
|
crop_size: either an integer H, where H is both the height and width of the |
|
central crop, or a list or tuple [H, W] of integers, where H and W are |
|
height and width of the central crop respectively. If `crop_size` is not |
|
specified, then the largest possible center crop will be taken. |
|
|
|
Returns: |
|
A function, that applies central crop. |
|
""" |
|
if crop_size: |
|
crop_size = utils.maybe_repeat(crop_size, 2) |
|
|
|
def _crop(image): |
|
if crop_size: |
|
h, w = crop_size[0], crop_size[1] |
|
else: |
|
h = w = tf.minimum(tf.shape(image)[0], tf.shape(image)[1]) |
|
dy = (tf.shape(image)[0] - h) // 2 |
|
dx = (tf.shape(image)[1] - w) // 2 |
|
return tf.image.crop_to_bounding_box(image, dy, dx, h, w) |
|
|
|
return _crop |
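
# Sketch: with crop_size=None the largest possible square center crop is
# taken, e.g. a (480, 640, 3) image becomes (480, 480, 3).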


@Registry.register("preprocess_ops.flip_lr") |
|
@utils.InKeyOutKey() |
|
def get_random_flip_lr(): |
|
"""Flips an image horizontally with probability 50%.""" |
|
|
|
def _random_flip_lr_pp(image): |
|
return tf.image.random_flip_left_right(image) |
|
|
|
return _random_flip_lr_pp |


@Registry.register("preprocess_ops.vgg_value_range") |
|
@utils.InKeyOutKey() |
|
def get_vgg_value_range( |
|
mean=(0.485 * 255, 0.456 * 255, 0.406 * 255), |
|
std=(0.229 * 255, 0.224 * 255, 0.225 * 255), |
|
): |
|
"""VGG-style preprocessing, subtracts mean and divides by stddev. |
|
|
|
This preprocessing is very common for ImageNet pre-trained models since VGG, |
|
and to this day the standard for models coming from most PyTorch codes. |
|
|
|
Args: |
|
mean: Tuple of values to be subtracted. Default to widespread VGG values. |
|
std: Tuple of values to be divided by. Default to widespread VGG values. |
|
|
|
Returns: |
|
A function to rescale the values. |
|
""" |
|
mean = tf.constant(mean, tf.float32) |
|
std = tf.constant(std, tf.float32) |
|
|
|
def _vgg_value_range(image): |
|
return (tf.cast(image, tf.float32) - mean) / std |
|
return _vgg_value_range |
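
# Sketch: inputs are expected in [0, 255]; for example a red-channel value of
# 128 maps to (128 - 0.485 * 255) / (0.229 * 255) ≈ 0.074.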


@Registry.register("preprocess_ops.clip_value_range") |
|
@utils.InKeyOutKey() |
|
def get_clip_value_range(): |
|
mean = (0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255) |
|
std = (0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255) |
|
|
|
def _clip_value_range(image): |
|
return (tf.cast(image, tf.float32) - mean) / std |
|
return _clip_value_range |
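
# Note: this mirrors get_vgg_value_range above, but with CLIP's normalization
# constants baked in.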


@Registry.register("preprocess_ops.convert_to_video") |
|
@utils.InKeyOutKey() |
|
def get_convert_to_video(num_frames): |
|
"""Converts an image to a video with zero padded frames. |
|
|
|
Args: |
|
num_frames: total number of frames that the video should have. |
|
|
|
Returns: |
|
A function for converting an image to a video. |
|
""" |
|
|
|
def _convert_to_video(image): |
|
return tf.pad( |
|
tf.expand_dims(image, axis=0), |
|
[[0, num_frames - 1], [0, 0], [0, 0], [0, 0]], |
|
) |
|
|
|
return _convert_to_video |
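
# Sketch: a (224, 224, 3) image becomes an (8, 224, 224, 3) video whose first
# frame is the image and whose remaining frames are all zeros (`img` is any
# decoded image tensor):
#   fn = get_convert_to_video(num_frames=8)
#   fn({"image": img})["image"].shape  # -> (8, 224, 224, 3)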