Spaces:

innat
/

VideoMAE

Running

App Files Files Community

VideoMAE / app.py

innat

Update app.py

eeed60d about 1 year ago

raw

history blame

3.26 kB

	import gradio as gr
	import numpy as np
	import imageio

	import tensorflow as tf
	from tensorflow import keras

	from utils import TubeMaskingGenerator
	from utils import read_video, frame_sampling, denormalize, reconstrunction
	from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
	from labels import K400_label_map, SSv2_label_map, UCF_label_map


	# Checkpoint identifiers for each supported dataset key.
	# Order matters: get_model() reads index 0 as the fine-tuned ("FT") model
	# and index 1 as the pre-trained ("PT") model.
	MODELS = {
	    "K400": [
	        "innat/videomae/TFVideoMAE_S_K400_16x224_FT",
	        "innat/videomae/TFVideoMAE_S_K400_16x224_PT",
	    ],
	    # No checkpoints are registered for these two datasets.
	    "SSv2": [],
	    "UCF": [],
	}


	def tube_mask_generator():
	    """Create a fresh tube mask for the pre-trained (masked) model.

	    The mask grid is derived from the module-level ``num_frames``,
	    ``input_size`` and ``patch_size`` settings and is produced by
	    ``TubeMaskingGenerator`` with ``mask_ratio=0.70``.

	    Returns:
	        A boolean ``tf.Tensor`` containing the generated mask with an
	        extra leading axis inserted at position 0.
	    """
	    grid = (
	        num_frames // 2,
	        input_size // patch_size[0],
	        input_size // patch_size[1],
	    )
	    generator = TubeMaskingGenerator(input_size=grid, mask_ratio=0.70)

	    # Materialise the mask as int32 first, then add the leading axis and
	    # convert to bool.
	    mask = tf.constant(generator(), dtype=tf.int32)
	    return tf.cast(tf.expand_dims(mask, axis=0), tf.bool)


	def video_to_gif(video_array, gif_filename):
	    """Save the frames in ``video_array`` as a GIF at ``gif_filename``.

	    The frames are handed to ``imageio.mimsave`` with ``duration=100``.
	    NOTE(review): the unit of ``duration`` depends on the installed imageio
	    version -- confirm it matches the intended playback speed.
	    """
	    imageio.mimsave(gif_filename, video_array, duration=100)


	# Already-loaded (ft_model, pt_model, label_map) triples keyed by dataset
	# name, so the checkpoints are loaded once per process instead of once
	# per request.
	_MODEL_CACHE = {}


	def get_model(data_type):
	    """Return the models and the index-to-label map for a dataset.

	    Args:
	        data_type: Dataset key, one of the keys of ``MODELS``
	            ('K400', 'SSv2' or 'UCF').

	    Returns:
	        A tuple ``(ft_model, pt_model, label_map)``: the fine-tuned model,
	        the pre-trained model, and the dataset's label map inverted so
	        that its former values become the keys.

	    Raises:
	        ValueError: If ``data_type`` is not a known dataset, or if fewer
	            than two checkpoints are registered for it in ``MODELS``.
	    """
	    cached = _MODEL_CACHE.get(data_type)
	    if cached is not None:
	        return cached

	    # Pick the label map that belongs to the requested dataset rather than
	    # always using the K400 one.
	    # NOTE(review): assumes SSv2_label_map and UCF_label_map have the same
	    # key/value layout as K400_label_map -- confirm in labels.py.
	    label_maps = {
	        'K400': K400_label_map,
	        'SSv2': SSv2_label_map,
	        'UCF': UCF_label_map,
	    }
	    if data_type not in label_maps:
	        raise ValueError(f"Unknown dataset type: {data_type!r}")

	    checkpoints = MODELS.get(data_type, [])
	    if len(checkpoints) < 2:
	        # Fail with a readable message instead of a bare IndexError.
	        raise ValueError(
	            f"No model checkpoints are registered for dataset {data_type!r}"
	        )

	    ft_model = keras.models.load_model(checkpoints[0])
	    pt_model = keras.models.load_model(checkpoints[1])
	    label_map = {v: k for k, v in label_maps[data_type].items()}

	    _MODEL_CACHE[data_type] = (ft_model, pt_model, label_map)
	    return _MODEL_CACHE[data_type]


	def inference(video_file, dataset_type):
	    """Classify a video and render its masked-autoencoder reconstruction.

	    Args:
	        video_file: The uploaded video as delivered by the Gradio video
	            input; it is passed straight to ``read_video``.
	        dataset_type: Dataset key forwarded to ``get_model``.

	    Returns:
	        A tuple ``(confidences, gif_path)``. ``confidences`` maps each
	        label to its softmax probability, inserted from most to least
	        probable. ``gif_path`` is the path of a GIF whose frames show the
	        input, the masked input and the reconstruction side by side.
	    """
	    # Local import keeps this block self-contained.
	    import tempfile

	    container = read_video(video_file)
	    clip = frame_sampling(container, num_frames=num_frames)
	    batch = clip[None, ...]  # add a leading axis for the model call
	    bool_masked_pos_tf = tube_mask_generator()
	    ft_model, pt_model, label_map = get_model(dataset_type)
	    ft_model.trainable = False
	    pt_model.trainable = False

	    # Inference on the fine-tuned model.
	    outputs_ft = ft_model(batch, training=False)
	    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
	    confidences = {
	        label_map[i]: float(probabilities[i])
	        for i in np.argsort(probabilities)[::-1]
	    }

	    # Inference on the pre-trained model.
	    outputs_pt = pt_model(batch, bool_masked_pos_tf, training=False)
	    reconstruct_output, mask = reconstrunction(
	        batch, bool_masked_pos_tf, outputs_pt
	    )

	    input_frame = denormalize(clip)
	    input_mask = denormalize(mask[0] * clip)
	    output_frame = denormalize(reconstruct_output)

	    # One output frame per time step: original | masked | reconstructed.
	    combined_frames = [
	        np.hstack([frame_a, frame_b, frame_c])
	        for frame_a, frame_b, frame_c in zip(
	            input_frame, input_mask, output_frame
	        )
	    ]

	    # Write to a unique temporary file so that concurrent requests do not
	    # overwrite each other's GIF (previously a fixed 'combined.gif').
	    with tempfile.NamedTemporaryFile(suffix='.gif', delete=False) as tmp:
	        combined_gif = tmp.name
	    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
	    return confidences, combined_gif


	# --- Gradio UI -------------------------------------------------------------
	# Input widgets: the video to analyse and the dataset whose models are used.
	video_input = gr.Video(type="file")
	dataset_input = gr.Radio(
	    ['K400', 'SSv2', 'UCF'],
	    type='value',
	    default='K400',
	    label='Dataset',
	)

	# Output widgets: top-3 class confidences and the reconstruction GIF.
	confidence_output = gr.Label(num_top_classes=3, label='confidence scores')
	reconstruction_output = gr.Image(
	    type="filepath", label='reconstructed masked autoencoder'
	)

	# Three example rows, each pointing at the same sample clip.
	example_rows = [["examples/k400.mp4"] for _ in range(3)]

	demo = gr.Interface(
	    fn=inference,
	    inputs=[video_input, dataset_input],
	    outputs=[confidence_output, reconstruction_output],
	    examples=example_rows,
	    title="VideoMAE",
	)
	demo.launch()