# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image-centric preprocessing ops.
All preprocessing ops should return a data processing functors. A data
is represented as a dictionary of (TF) tensors. The functors output a modified
dictionary.
The key named "image" is commonly used for the image, and is a 3D tensor of
shape (height x width x channels).
"""
from big_vision.pp import utils
from big_vision.pp.registry import Registry
import tensorflow as tf
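
# Note: thanks to the @utils.InKeyOutKey() decorator, each op factory below is
# first called with its config arguments and the resulting functor is then
# applied to a dict of tensors, reading and writing the "image" key by default.
# A minimal usage sketch (`encoded_jpeg_bytes` is an illustrative name):
#
#   decode_fn = get_decode()
#   resize_fn = get_resize(224)
#   features = {"image": encoded_jpeg_bytes}
#   features = resize_fn(decode_fn(features))  # features["image"]: (224,224,3)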
@Registry.register("preprocess_ops.decode")
@utils.InKeyOutKey()
def get_decode(channels=3, precise=False):
"""Decode an encoded image string, see tf.io.decode_image.
Args:
channels: see tf.io.decode_image.
precise: if False, use default TF image decoding algorithm.
If True, change DCT method for JPEG decoding to match PIL/cv2/PyTorch.
See also (internal link) for a concrete example.
Returns:
The decoded image.
"""
def _decode(image):
if precise:
return tf.image.decode_jpeg( # Also supports png btw.
image, channels=channels, dct_method="INTEGER_ACCURATE")
else:
return tf.io.decode_image(
image, channels=channels, expand_animations=False)
return _decode
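
# Example sketch: with precise=True the JPEG DCT method is pinned to
# "INTEGER_ACCURATE" so that decoded pixels line up with PIL/cv2/PyTorch
# decoders; with the default method, individual pixel values may differ
# slightly. (`jpeg_bytes` is an illustrative name.)
#   img = get_decode(precise=True)({"image": jpeg_bytes})["image"]  # uint8 HWC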
@Registry.register("preprocess_ops.resize")
@utils.InKeyOutKey()
def get_resize(size, method="bilinear", antialias=False):
"""Resizes image to a given size.
Args:
size: either an integer H, where H is both the new height and width
of the resized image, or a list or tuple [H, W] of integers, where H and W
are new image"s height and width respectively.
method: resize method, see tf.image.resize docs for options.
antialias: see tf.image.resize. Ideally set to True for all new configs.
Returns:
A function for resizing an image.
"""
size = utils.maybe_repeat(size, 2)
def _resize(image):
"""Resizes image to a given size."""
    # Note: use the TF-2 version of tf.image.resize, as the TF-1 version is
    # buggy: https://github.com/tensorflow/tensorflow/issues/6720.
    # In particular, it was not equivariant with rotation and led the network
    # to learn a shortcut in the self-supervised rotation task if rotation was
    # applied after resizing.
dtype = image.dtype
tf_dtype = tf.type_spec_from_value(image).dtype
image = tf.image.resize(image, size, method=method, antialias=antialias)
return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)
return _resize
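
# Usage sketch: the op preserves the input dtype by clipping the float resize
# output to the dtype's range and casting back, so a uint8 image stays uint8.
#   out = get_resize([256, 128], method="bicubic", antialias=True)(
#       {"image": img})["image"]  # shape (256, 128, C), dtype == img.dtype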
# This functionality is used by resize_small and resize_long. But we're not
# registering it as a pp op yet, as there is no need for it. However, it can
# probably be slightly generalized into "scale augmentation" eventually.
def _resize_factor(image, factor, method="area", antialias=True):
"""Resizes the image by a (float) `factor`, keeping the aspect ratio fixed."""
h, w = tf.shape(image)[0], tf.shape(image)[1]
h = tf.cast(tf.round(tf.cast(h, tf.float32) * factor), tf.int32)
w = tf.cast(tf.round(tf.cast(w, tf.float32) * factor), tf.int32)
dtype = image.dtype
tf_dtype = tf.type_spec_from_value(image).dtype
image = tf.image.resize(image, (h, w), method=method, antialias=antialias)
return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)
@Registry.register("preprocess_ops.resize_small")
@utils.InKeyOutKey()
def get_resize_small(smaller_size, method="area", antialias=False):
"""Resizes the smaller side to `smaller_size` keeping aspect ratio.
Args:
smaller_size: an integer, that represents a new size of the smaller side of
an input image.
method: the resize method. `area` is a meaningful, bwd-compat default.
antialias: see tf.image.resize. Ideally set to True for all new configs.
Returns:
A function, that resizes an image and preserves its aspect ratio.
Note:
backwards-compat for "area"+antialias tested here:
(internal link)
"""
def _resize_small(image): # pylint: disable=missing-docstring
h, w = tf.shape(image)[0], tf.shape(image)[1]
factor = (
tf.cast(smaller_size, tf.float32) /
tf.cast(tf.minimum(h, w), tf.float32))
return _resize_factor(image, factor, method=method, antialias=antialias)
return _resize_small
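
# E.g. the classic ImageNet eval preprocessing is resize_small followed by a
# central crop (a sketch; get_central_crop is defined further below):
#   data = get_resize_small(256)(data)
#   data = get_central_crop(224)(data)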
@Registry.register("preprocess_ops.resize_long")
@utils.InKeyOutKey()
def get_resize_long(longer_size, method="area", antialias=True):
"""Resizes the longer side to `longer_size` keeping aspect ratio.
Args:
longer_size: an integer, that represents a new size of the longer side of
an input image.
method: the resize method. `area` is a meaningful, bwd-compat default.
antialias: see tf.image.resize. Ideally set to True for all new configs.
Returns:
A function, that resizes an image and preserves its aspect ratio.
"""
def _resize_long(image): # pylint: disable=missing-docstring
h, w = tf.shape(image)[0], tf.shape(image)[1]
factor = (
tf.cast(longer_size, tf.float32) /
tf.cast(tf.maximum(h, w), tf.float32))
return _resize_factor(image, factor, method=method, antialias=antialias)
return _resize_long
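
# Sketch: get_resize_long(224) makes the *longer* side 224, so the result
# always fits inside a 224x224 box (padding to a fixed size, if needed, would
# be a separate op):
#   img = get_resize_long(224)({"image": img})["image"]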
@Registry.register("preprocess_ops.inception_crop")
@utils.InKeyOutKey()
def get_inception_crop(size=None, area_min=5, area_max=100,
method="bilinear", antialias=False):
"""Makes inception-style image crop.
Inception-style crop is a random image crop (its size and aspect ratio are
random) that was used for training Inception models, see
https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.
Args:
size: Resize image to [size, size] after crop.
area_min: minimal crop area.
area_max: maximal crop area.
method: rezied method, see tf.image.resize docs for options.
antialias: see tf.image.resize. Ideally set to True for all new configs.
Returns:
A function, that applies inception crop.
"""
def _inception_crop(image): # pylint: disable=missing-docstring
begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
tf.shape(image),
tf.zeros([0, 0, 4], tf.float32),
area_range=(area_min / 100, area_max / 100),
min_object_covered=0, # Don't enforce a minimum area.
use_image_if_no_bounding_boxes=True)
crop = tf.slice(image, begin, crop_size)
    # Unfortunately, the operation above loses the static shape of the channel
    # dimension, so we need to restore it manually.
crop.set_shape([None, None, image.shape[-1]])
if size:
crop = get_resize(size, method, antialias)({"image": crop})["image"]
return crop
return _inception_crop
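
# Sketch of a typical train-time combination with other ops in this file
# (exact sizes are illustrative, not prescribed here):
#   data = get_decode()(data)
#   data = get_inception_crop(224)(data)   # random scale and aspect ratio.
#   data = get_random_flip_lr()(data)      # defined further below.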
@Registry.register("preprocess_ops.decode_jpeg_and_inception_crop")
@utils.InKeyOutKey()
def get_decode_jpeg_and_inception_crop(size=None, area_min=5, area_max=100,
ratio_min=0.75, ratio_max=1.33,
method="bilinear", antialias=False):
"""Decode jpeg string and make inception-style image crop.
Inception-style crop is a random image crop (its size and aspect ratio are
random) that was used for training Inception models, see
https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.
Args:
size: Resize image to [size, size] after crop.
area_min: minimal crop area.
area_max: maximal crop area.
ratio_min: minimal aspect ratio.
ratio_max: maximal aspect ratio.
method: rezied method, see tf.image.resize docs for options.
antialias: see tf.image.resize. Ideally set to True for all new configs.
Returns:
A function, that applies inception crop.
"""
def _inception_crop(image_data): # pylint: disable=missing-docstring
shape = tf.image.extract_jpeg_shape(image_data)
begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
shape,
tf.zeros([0, 0, 4], tf.float32),
area_range=(area_min / 100, area_max / 100),
aspect_ratio_range=(ratio_min, ratio_max),
min_object_covered=0, # Don't enforce a minimum area.
use_image_if_no_bounding_boxes=True)
# Crop the image to the specified bounding box.
offset_y, offset_x, _ = tf.unstack(begin)
target_height, target_width, _ = tf.unstack(crop_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
image = tf.image.decode_and_crop_jpeg(image_data, crop_window, channels=3)
if size:
image = get_resize(size, method, antialias)({"image": image})["image"]
return image
return _inception_crop
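
# Note: compared to a separate decode + inception_crop, this fused op relies on
# tf.image.decode_and_crop_jpeg, which only decodes the cropped window of the
# JPEG and is therefore typically faster. Sketch (`jpeg_bytes` illustrative):
#   img = get_decode_jpeg_and_inception_crop(224)(
#       {"image": jpeg_bytes})["image"]  # uint8, (224, 224, 3)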
@Registry.register("preprocess_ops.random_crop")
@utils.InKeyOutKey()
def get_random_crop(crop_size):
"""Makes a random crop of a given size.
Args:
crop_size: either an integer H, where H is both the height and width of the
random crop, or a list or tuple [H, W] of integers, where H and W are
height and width of the random crop respectively.
Returns:
A function, that applies random crop.
"""
crop_size = utils.maybe_repeat(crop_size, 2)
def _crop(image):
return tf.image.random_crop(image, (*crop_size, image.shape[-1]))
return _crop
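
# Sketch: tf.image.random_crop requires the input to be at least crop_size in
# both dimensions, so this op is usually preceded by a resize, e.g.:
#   data = get_resize_small(40)(data)
#   data = get_random_crop(32)(data)  # e.g. CIFAR-style augmentation.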
@Registry.register("preprocess_ops.central_crop")
@utils.InKeyOutKey()
def get_central_crop(crop_size=None):
"""Makes central crop of a given size.
Args:
crop_size: either an integer H, where H is both the height and width of the
central crop, or a list or tuple [H, W] of integers, where H and W are
height and width of the central crop respectively. If `crop_size` is not
specified, then the largest possible center crop will be taken.
Returns:
A function, that applies central crop.
"""
if crop_size:
crop_size = utils.maybe_repeat(crop_size, 2)
def _crop(image):
if crop_size:
h, w = crop_size[0], crop_size[1]
else:
h = w = tf.minimum(tf.shape(image)[0], tf.shape(image)[1])
dy = (tf.shape(image)[0] - h) // 2
dx = (tf.shape(image)[1] - w) // 2
return tf.image.crop_to_bounding_box(image, dy, dx, h, w)
return _crop
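
# Sketch: with no argument, the op takes the largest possible centered square,
# which composes naturally with a plain resize:
#   data = get_central_crop()(data)
#   data = get_resize(224)(data)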
@Registry.register("preprocess_ops.flip_lr")
@utils.InKeyOutKey()
def get_random_flip_lr():
"""Flips an image horizontally with probability 50%."""
def _random_flip_lr_pp(image):
return tf.image.random_flip_left_right(image)
return _random_flip_lr_pp
@Registry.register("preprocess_ops.vgg_value_range")
@utils.InKeyOutKey()
def get_vgg_value_range(
mean=(0.485 * 255, 0.456 * 255, 0.406 * 255),
std=(0.229 * 255, 0.224 * 255, 0.225 * 255),
):
"""VGG-style preprocessing, subtracts mean and divides by stddev.
This preprocessing is very common for ImageNet pre-trained models since VGG,
and to this day the standard for models coming from most PyTorch codes.
Args:
mean: Tuple of values to be subtracted. Default to widespread VGG values.
std: Tuple of values to be divided by. Default to widespread VGG values.
Returns:
A function to rescale the values.
"""
mean = tf.constant(mean, tf.float32)
std = tf.constant(std, tf.float32)
def _vgg_value_range(image):
return (tf.cast(image, tf.float32) - mean) / std
return _vgg_value_range
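
# Sketch: a uint8 image in [0, 255] comes out float32 and roughly
# zero-mean/unit-variance per channel:
#   img = get_vgg_value_range()({"image": img})["image"]  # float32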
@Registry.register("preprocess_ops.clip_value_range")
@utils.InKeyOutKey()
def get_clip_value_range():
  """CLIP-style preprocessing, using the mean/stddev of the CLIP models."""
mean = (0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255)
std = (0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255)
def _clip_value_range(image):
return (tf.cast(image, tf.float32) - mean) / std
return _clip_value_range
@Registry.register("preprocess_ops.convert_to_video")
@utils.InKeyOutKey()
def get_convert_to_video(num_frames):
"""Converts an image to a video with zero padded frames.
Args:
num_frames: total number of frames that the video should have.
Returns:
A function for converting an image to a video.
"""
def _convert_to_video(image):
return tf.pad(
tf.expand_dims(image, axis=0),
[[0, num_frames - 1], [0, 0], [0, 0], [0, 0]],
)
return _convert_to_video
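
# Sketch: an (H, W, C) image becomes a (num_frames, H, W, C) video whose first
# frame is the image and whose remaining frames are all zeros:
#   vid = get_convert_to_video(8)({"image": img})["image"]  # (8, H, W, C)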