Spaces:

innat
/

VideoMAE

Sleeping

App Files Files Community

VideoMAE / utils.py

innat

Update utils.py

93ca8bb about 1 year ago

raw

history blame contribute delete

3.61 kB


	import tensorflow as tf
	import numpy as np
	from einops import rearrange
	from decord import VideoReader

	# Module-level configuration shared by the helpers in this file.
	# Frames per clip; presumably what callers pass to frame_sampling().
	num_frames = 16
	# Side length (pixels) that frame_sampling() resizes every frame to.
	input_size = 224
	# Spatial patch size (height, width) used when cutting frames into patches.
	patch_size = (16,) * 2
	# Per-channel shift and scale applied by format_frames() and undone by
	# denormalize(); the same value is used for all three channels.
	IMAGENET_MEAN = np.array([0.45] * 3)
	IMAGENET_STD = np.array([0.225] * 3)

	def format_frames(frame, output_size):
	    """Resize frames and standardize their pixel values.

	    The input is converted to ``tf.uint8``, resized to ``output_size``,
	    scaled by 1/255, then shifted by ``IMAGENET_MEAN`` and divided by
	    ``IMAGENET_STD``.

	    Args:
	        frame: Image data accepted by the ``tf.image`` ops used below.
	        output_size: Target size forwarded to ``tf.image.resize``.

	    Returns:
	        The resized and normalized frames.
	    """
	    as_uint8 = tf.image.convert_image_dtype(frame, tf.uint8)
	    resized = tf.image.resize(as_uint8, size=output_size)
	    scaled = resized / 255.
	    return (scaled - IMAGENET_MEAN) / IMAGENET_STD

	def read_video(file_path):
	    """Open the video at ``file_path`` and return its ``VideoReader``."""
	    return VideoReader(file_path)

	def frame_sampling(container, num_frames):
	    """Sample ``num_frames`` frames from a video and normalize them.

	    The video is split into ``num_frames`` consecutive segments of equal
	    length and one frame is drawn uniformly at random from each segment.
	    When the video has fewer frames than ``num_frames`` the segments would
	    be empty, so evenly spaced indices (with repeats) are used instead.

	    Args:
	        container: Object supporting ``len()`` and
	            ``get_batch(indices).asnumpy()``, e.g. the reader returned by
	            ``read_video``.
	        num_frames: Number of frames to sample.

	    Returns:
	        The sampled frames after ``format_frames`` has resized them to
	        ``input_size`` x ``input_size`` and normalized them.

	    Raises:
	        ValueError: If the video contains no frames.
	    """
	    total_frames = len(container)
	    if total_frames == 0:
	        raise ValueError("cannot sample frames from an empty video")

	    interval = total_frames // num_frames
	    if interval > 0:
	        segment_starts = np.arange(num_frames) * interval
	        offset = np.random.randint(interval, size=segment_starts.shape)
	        frame_index = segment_starts + offset
	    else:
	        # np.random.randint(0, ...) raises ValueError, so short clips are
	        # covered deterministically by repeating frames.
	        frame_index = (
	            np.linspace(0, total_frames - 1, num_frames).round().astype(np.int64)
	        )

	    # asnumpy() already yields one stacked array; no extra np.stack needed.
	    frames = container.get_batch(frame_index).asnumpy()
	    return format_frames(frames, [input_size] * 2)

	def denormalize(image):
	    """Undo the mean/std normalization and return ``uint8`` pixel values.

	    Args:
	        image: A NumPy array, or any object exposing ``.numpy()``, holding
	            values normalized with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

	    Returns:
	        A ``uint8`` NumPy array with values clipped to ``[0, 255]``.
	    """
	    pixels = image if isinstance(image, np.ndarray) else image.numpy()
	    restored = pixels * IMAGENET_STD + IMAGENET_MEAN
	    return (restored * 255).clip(0, 255).astype('uint8')

	def reconstrunction(input_frame, bool_mask, pretrained_pred):
	    """Merge model predictions into a clip and build the matching mask.

	    The clip is cut into tubelet patches (2 frames x ``patch_size``), each
	    patch is normalized per channel by its own mean and standard deviation,
	    the entries selected by ``bool_mask`` are overwritten with
	    ``pretrained_pred``, and the result is mapped back to pixel space with
	    the same per-patch statistics. The misspelled function name is kept
	    because callers import it under this name.

	    Args:
	        input_frame: Clip laid out as ``(batch, time, height, width,
	            channels)``; a NumPy array or an object exposing ``.numpy()``.
	            Time must be divisible by 2 and height/width by ``patch_size``.
	        bool_mask: Index selecting the masked patches of the
	            ``(batch, num_patches, patch_dim)`` patch array; presumably a
	            boolean array of shape ``(batch, num_patches)`` - confirm
	            against the caller.
	        pretrained_pred: Values written into the masked patches; must be
	            assignable to ``patches[bool_mask]``.

	    Returns:
	        A tuple ``(rec_img, mask)`` for the first batch element, both laid
	        out as ``(time, height, width, channels)``. ``mask`` is 0 where a
	        patch was replaced by the prediction and 1 elsewhere.
	    """
	    tubelet = 2  # frames grouped into one patch along the time axis
	    patch_h, patch_w = patch_size
	    frames = (
	        input_frame if isinstance(input_frame, np.ndarray)
	        else input_frame.numpy()
	    )

	    # Derive the patch grid and channel count from the clip itself rather
	    # than assuming 224x224 RGB input.
	    grid_h = frames.shape[2] // patch_h
	    grid_w = frames.shape[3] // patch_w
	    channels = frames.shape[-1]

	    to_patches = 'b (t p0) (h p1) (w p2) c -> b (t h w) (p0 p1 p2) c'
	    from_patches = 'b (t h w) (p0 p1 p2) c -> b (t p0) (h p1) (w p2) c'
	    patch_axes = dict(p0=tubelet, p1=patch_h, p2=patch_w)

	    img_squeeze = rearrange(frames, to_patches, **patch_axes)

	    # Per-patch, per-channel statistics. They are computed once and reused
	    # below so that de-normalization is the exact inverse of normalization.
	    img_mean = np.mean(img_squeeze, axis=-2, keepdims=True)
	    img_std = np.sqrt(
	        np.var(img_squeeze, axis=-2, ddof=1, keepdims=True)
	    ) + 1e-6

	    img_norm = (img_squeeze - img_mean) / img_std
	    img_patch = rearrange(img_norm, 'b n p c -> b n (p c)')
	    img_patch[bool_mask] = pretrained_pred

	    # make mask: 0 for patches replaced by the prediction, 1 for the rest
	    mask = np.ones_like(img_patch)
	    mask[bool_mask] = 0
	    mask = rearrange(mask, 'b n (p c) -> b n p c', c=channels)
	    mask = rearrange(mask, from_patches, h=grid_h, w=grid_w, **patch_axes)

	    # Reconstruction video: add each patch's original mean and std back so
	    # the normalized predictions can be displayed as pixels.
	    rec_img = rearrange(img_patch, 'b n (p c) -> b n p c', c=channels)
	    rec_img = rec_img * img_std + img_mean
	    rec_img = rearrange(rec_img, from_patches, h=grid_h, w=grid_w, **patch_axes)

	    return rec_img[0], mask[0]


	class TubeMaskingGenerator:
	    """Generate a random patch mask that is identical for every frame.

	    One random 0/1 pattern is drawn over the ``height * width`` patch
	    positions of a single frame and repeated for all frames.

	    Args:
	        input_size: Triple ``(frames, height, width)`` giving the patch
	            grid.
	        mask_ratio: Fraction of the patches in each frame to mask; must
	            lie in ``[0, 1]``.

	    Raises:
	        ValueError: If ``mask_ratio`` is outside ``[0, 1]``.
	    """

	    def __init__(self, input_size, mask_ratio):
	        # Reject bad ratios here; otherwise __call__ fails later with a
	        # negative-dimension error from NumPy.
	        if not 0 <= mask_ratio <= 1:
	            raise ValueError(
	                "mask_ratio must be in [0, 1], got {}".format(mask_ratio)
	            )
	        self.frames, self.height, self.width = input_size
	        self.num_patches_per_frame = self.height * self.width
	        self.total_patches = self.frames * self.num_patches_per_frame
	        self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame)
	        self.total_masks = self.frames * self.num_masks_per_frame

	    def __repr__(self):
	        return "Mask: total patches {}, mask patches {}".format(
	            self.total_patches, self.total_masks
	        )

	    def __call__(self):
	        """Return a flat float mask of length ``total_patches`` (1 = masked)."""
	        num_visible = self.num_patches_per_frame - self.num_masks_per_frame
	        mask_per_frame = np.hstack([
	            np.zeros(num_visible),
	            np.ones(self.num_masks_per_frame),
	        ])
	        np.random.shuffle(mask_per_frame)
	        # Repeating the 1-D pattern back to back gives every frame the
	        # same mask.
	        return np.tile(mask_per_frame, self.frames)