Spaces:

innat
/

VideoMAE

Running

App Files Files Community

VideoMAE / app.py

innat

Update app.py

0458fda about 1 year ago

raw

history blame

4.26 kB

	import gradio as gr
	import numpy as np
	import imageio

	import tensorflow as tf
	from tensorflow import keras

	from utils import TubeMaskingGenerator
	from utils import read_video, frame_sampling, denormalize, reconstrunction
	from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
	from labels import K400_label_map, SSv2_label_map, UCF_label_map

	# Checkpoint directories per dataset key, ordered [fine-tuned, pre-trained].
	# NOTE(review): every key currently resolves to the same K400-named
	# checkpoint paths -- confirm whether SSv2/UCF checkpoints should be wired in.
	MODELS = dict(
	    K400=[
	        './TFVideoMAE_S_K400_16x224_FT',
	        './TFVideoMAE_S_K400_16x224_PT',
	    ],
	    SSv2=[
	        './TFVideoMAE_S_K400_16x224_FT',
	        './TFVideoMAE_S_K400_16x224_PT',
	    ],
	    UCF=[
	        './TFVideoMAE_S_K400_16x224_FT',
	        './TFVideoMAE_S_K400_16x224_PT',
	    ],
	)

	# Label map imported from ``labels`` for each dataset key offered in the UI.
	LABEL_MAPS = dict(
	    K400=K400_label_map,
	    SSv2=SSv2_label_map,
	    UCF=UCF_label_map,
	)

	def tube_mask_generator(mask_ratio):
	    """Build a batched boolean mask from ``TubeMaskingGenerator``.

	    The generator's ``input_size`` is derived from the module-level
	    ``num_frames``, ``input_size`` and ``patch_size`` values:
	    ``num_frames // 2`` along the first axis and
	    ``input_size // patch_size[i]`` along the remaining two.

	    Args:
	        mask_ratio: Ratio forwarded unchanged to ``TubeMaskingGenerator``.

	    Returns:
	        A ``tf.bool`` tensor: the generator output with a new leading
	        axis of size 1.
	    """
	    temporal_size = num_frames // 2
	    grid_height = input_size // patch_size[0]
	    grid_width = input_size // patch_size[1]

	    generator = TubeMaskingGenerator(
	        input_size=(temporal_size, grid_height, grid_width),
	        mask_ratio=mask_ratio,
	    )

	    # Materialise as int32, prepend the batch axis, then convert to bool.
	    mask = tf.constant(generator(), dtype=tf.int32)
	    return tf.cast(tf.expand_dims(mask, axis=0), tf.bool)


	# Models already loaded in this process, keyed by checkpoint path, so each
	# checkpoint is read from disk at most once instead of once per request.
	_LOADED_MODELS = {}


	def _load_model_cached(path):
	    """Return the Keras model stored at ``path``, loading it on first use."""
	    if path not in _LOADED_MODELS:
	        _LOADED_MODELS[path] = keras.models.load_model(path)
	    return _LOADED_MODELS[path]


	def get_model(data_type):
	    """Return the fine-tuned model, pre-trained model and label lookup.

	    Args:
	        data_type: Key into ``MODELS`` (e.g. ``'K400'``).

	    Returns:
	        A tuple ``(ft_model, pt_model, label_map)``. ``ft_model`` and
	        ``pt_model`` are loaded from ``MODELS[data_type][0]`` and
	        ``MODELS[data_type][1]``; repeated calls reuse the already-loaded
	        objects. ``label_map`` is ``K400_label_map`` with its keys and
	        values swapped, built fresh on every call.

	    Raises:
	        KeyError: If ``data_type`` is not a key of ``MODELS``.
	    """
	    checkpoint_paths = MODELS[data_type]
	    ft_model = _load_model_cached(checkpoint_paths[0])
	    pt_model = _load_model_cached(checkpoint_paths[1])

	    # The K400 label map is used for every dataset key: all paths in MODELS
	    # are K400-named checkpoints, so a per-dataset lookup in LABEL_MAPS
	    # would not match the model outputs.
	    # NOTE(review): switch to LABEL_MAPS[data_type] once dataset-specific
	    # checkpoints are configured in MODELS.
	    label_map = {v: k for k, v in K400_label_map.items()}

	    return ft_model, pt_model, label_map


	def inference(video_file, data_type, mask_ratio):
	    """Run classification and masked reconstruction on one video.

	    Args:
	        video_file: Video input passed to ``read_video``.
	        data_type: Dataset key passed to ``get_model``.
	        mask_ratio: Masking ratio passed to ``tube_mask_generator``.

	    Returns:
	        A tuple ``(confidences, gif_path)``. ``confidences`` maps each
	        label-map entry to its softmax score as a ``float``, inserted from
	        highest to lowest score. ``gif_path`` is ``'combined.gif'``, an
	        animation whose frames place the input, masked input and
	        reconstruction side by side.
	    """
	    separator = '---------------------------'
	    print(separator)
	    for received in (video_file, data_type, mask_ratio):
	        print(received)
	    print(separator)

	    # Decode the video and sample the frames fed to both models.
	    clip = frame_sampling(read_video(video_file), num_frames=num_frames)

	    # Mask and models; both models are used for inference only.
	    mask_positions = tube_mask_generator(mask_ratio)
	    classifier, autoencoder, id_to_label = get_model(data_type)
	    classifier.trainable = False
	    autoencoder.trainable = False

	    # Both models consume the clip with a leading batch axis.
	    batch = clip[None, ...]

	    # Fine-tuned model: class scores, ranked from most to least likely.
	    logits = classifier(batch, training=False)
	    probabilities = tf.nn.softmax(logits).numpy().squeeze(0)
	    ranking = np.argsort(probabilities)[::-1]
	    confidences = {
	        id_to_label[idx]: float(probabilities[idx]) for idx in ranking
	    }

	    # Pre-trained model: reconstruct the masked clip.
	    decoded = autoencoder(batch, mask_positions, training=False)
	    reconstruct_output, mask = reconstrunction(batch, mask_positions, decoded)

	    # Denormalize the three views for display.
	    original_view = denormalize(clip)
	    masked_view = denormalize(mask[0] * clip)
	    rebuilt_view = denormalize(reconstruct_output)

	    # Stitch the three views horizontally, one panel per frame.
	    panels = [
	        np.hstack([source, masked, rebuilt])
	        for source, masked, rebuilt in zip(original_view, masked_view, rebuilt_view)
	    ]

	    combined_gif = 'combined.gif'
	    imageio.mimsave(combined_gif, panels, duration=300, loop=0)
	    return confidences, combined_gif


	def main():
	    """Build the Gradio interface around ``inference`` and launch it."""
	    datasets = ['K400', 'SSv2', 'UCF']

	    # Each row is [video path, dataset, mask ratio], in the order of the
	    # input components below.
	    # NOTE(review): ucf.mp4 is paired with 'SSv2' and k400.mp4 with 'UCF' --
	    # confirm this pairing is intended.
	    sample_example = [
	        ["examples/k400.mp4", datasets[0], 0.9],
	        ["examples/ucf.mp4", datasets[1], 0.8],
	        ["examples/k400.mp4", datasets[2], 0.7],
	    ]

	    iface = gr.Interface(
	        fn=inference,
	        inputs=[
	            gr.Video(label="Input Video"),
	            # Initial values are set with ``value``; the older ``default``
	            # keyword is deprecated for top-level Gradio components.
	            gr.Radio(
	                datasets,
	                type='value',
	                value=datasets[0],
	                label='Dataset',
	            ),
	            gr.Slider(
	                0.5,
	                1.0,
	                step=0.1,
	                value=0.5,
	                label='Mask Ratio',
	            ),
	        ],
	        outputs=[
	            gr.Label(num_top_classes=3, label='scores'),
	            gr.Image(type="filepath", label='reconstructed'),
	        ],
	        examples=sample_example,
	        title="VideoMAE",
	        description="Keras reimplementation of <a href='https://github.com/innat/VideoMAE'>VideoMAE</a> is presented here."
	    )

	    iface.launch()

	# Launch the demo only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	main()