File size: 3,264 Bytes
24cfc1b fbc7dad 24cfc1b c01c45a 24cfc1b f811062 24cfc1b eeed60d 24cfc1b fbc7dad 24cfc1b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import gradio as gr
import numpy as np
import imageio
import tensorflow as tf
from tensorflow import keras
from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map
# Checkpoint identifiers grouped by dataset key. get_model() reads index 0 as
# the fine-tuned model and index 1 as the pre-trained model; datasets with an
# empty list have no checkpoints registered.
MODELS = dict(
    K400=[
        'innat/videomae/TFVideoMAE_S_K400_16x224_FT',
        'innat/videomae/TFVideoMAE_S_K400_16x224_PT',
    ],
    SSv2=[],
    UCF=[],
)
def tube_mask_generator(mask_ratio=0.70):
    """Create a boolean tube mask for the pre-trained (masked) model.

    Args:
        mask_ratio: Forwarded to ``TubeMaskingGenerator`` as ``mask_ratio``.
            Defaults to 0.70, so calling with no arguments behaves as before.

    Returns:
        A ``tf.bool`` tensor: the generator's output with one extra leading
        axis of size 1.
    """
    # Grid handed to the generator: num_frames // 2 along the first axis and
    # input_size // patch_size[i] along the two remaining axes.
    # NOTE(review): the "// 2" presumably reflects a temporal tubelet size of
    # 2 in the model -- confirm against the checkpoint configuration.
    window_size = (
        num_frames // 2,
        input_size // patch_size[0],
        input_size // patch_size[1],
    )
    tube_mask = TubeMaskingGenerator(
        input_size=window_size,
        mask_ratio=mask_ratio,
    )
    make_bool = tube_mask()

    # The mask is materialised as int32 first and only then cast to bool; the
    # dtype returned by the generator is not visible here, so the intermediate
    # integer conversion is kept as-is.
    bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
    bool_masked_pos_tf = tf.expand_dims(bool_masked_pos_tf, axis=0)
    bool_masked_pos_tf = tf.cast(bool_masked_pos_tf, tf.bool)
    return bool_masked_pos_tf
def video_to_gif(video_array, gif_filename):
    """Save ``video_array`` to ``gif_filename`` using ``imageio.mimsave``.

    Args:
        video_array: Sequence of frames passed straight to ``imageio.mimsave``.
        gif_filename: Destination path passed straight to ``imageio.mimsave``.
    """
    # NOTE(review): the unit of ``duration`` (seconds vs. milliseconds per
    # frame) depends on the installed imageio version -- confirm.
    writer_options = {'duration': 100}
    imageio.mimsave(gif_filename, video_array, **writer_options)
def get_model(data_type):
    """Load the models and the index-to-label lookup for one dataset.

    Args:
        data_type: Dataset key into ``MODELS`` ('K400', 'SSv2' or 'UCF').

    Returns:
        A ``(ft_model, pt_model, label_map)`` tuple. ``ft_model`` is loaded
        from the first checkpoint listed for the dataset and ``pt_model`` from
        the second. ``label_map`` is the dataset's label map with keys and
        values swapped, so it can be indexed by class index.

    Raises:
        KeyError: If ``data_type`` is not a known dataset key.
        IndexError: If fewer than two checkpoints are registered for the
            dataset.
    """
    checkpoints = MODELS[data_type]
    if len(checkpoints) < 2:
        # Same exception type a bare checkpoints[0] / checkpoints[1] would
        # raise, but with an actionable message and before any model loads.
        raise IndexError(
            f"No fine-tuned/pre-trained checkpoints registered for "
            f"dataset {data_type!r}"
        )

    # Pick the label map that belongs to the requested dataset instead of
    # always using the K400 one.
    # NOTE(review): assumes the SSv2 and UCF maps use the same orientation as
    # K400_label_map (inverted below) -- confirm in labels.py.
    label_maps = {
        'K400': K400_label_map,
        'SSv2': SSv2_label_map,
        'UCF': UCF_label_map,
    }
    source_map = label_maps[data_type]

    ft_model = keras.models.load_model(checkpoints[0])
    pt_model = keras.models.load_model(checkpoints[1])
    label_map = {v: k for k, v in source_map.items()}
    return ft_model, pt_model, label_map
def inference(video_file, dataset_type):
    """Classify a video and render a masked-reconstruction GIF for it.

    Args:
        video_file: Video input handed to ``read_video``.
        dataset_type: Dataset key forwarded to ``get_model``.

    Returns:
        A ``(confidences, combined_gif)`` tuple. ``confidences`` maps each
        label to its softmax probability, ordered from most to least likely.
        ``combined_gif`` is the path of a GIF whose frames place the input,
        the masked input and the reconstruction side by side.
    """
    # Local import: only this function needs it.
    import tempfile

    container = read_video(video_file)
    frames = frame_sampling(container, num_frames=num_frames)
    bool_masked_pos_tf = tube_mask_generator()

    ft_model, pt_model, label_map = get_model(dataset_type)
    ft_model.trainable = False
    pt_model.trainable = False

    # Both models take the clip with a leading axis added; build it once.
    batch = frames[None, ...]

    # Inference on the fine-tuned model.
    outputs_ft = ft_model(batch, training=False)
    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
    ranked_indices = np.argsort(probabilities)[::-1]  # highest score first
    confidences = {
        label_map[i]: float(probabilities[i]) for i in ranked_indices
    }

    # Inference on the pre-trained model.
    outputs_pt = pt_model(batch, bool_masked_pos_tf, training=False)
    reconstruct_output, mask = reconstrunction(
        batch, bool_masked_pos_tf, outputs_pt
    )

    input_frame = denormalize(frames)
    input_mask = denormalize(mask[0] * frames)
    output_frame = denormalize(reconstruct_output)

    # One wide frame per time step: input | masked input | reconstruction.
    combined_frames = [
        np.hstack([frame_a, frame_b, frame_c])
        for frame_a, frame_b, frame_c in zip(
            input_frame, input_mask, output_frame
        )
    ]

    # Write to a unique temporary file so overlapping requests cannot
    # overwrite each other's result (a fixed file name would be shared).
    # The file is intentionally left on disk because the caller needs the
    # path after this function returns.
    with tempfile.NamedTemporaryFile(suffix='.gif', delete=False) as tmp:
        combined_gif = tmp.name
    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
    return confidences, combined_gif
# Build the demo UI and start serving it. Inputs are a video and a dataset
# selector; outputs are the top class confidences and the reconstruction GIF.
gr.Interface(
    fn=inference,
    inputs=[
        # gr.Video takes no ``type`` argument, so none is passed.
        gr.Video(),
        gr.Radio(
            ['K400', 'SSv2', 'UCF'],
            type='value',
            # ``value`` is the keyword for the initial selection; with it set
            # a dataset is preselected when the page loads.
            value='K400',
            label='Dataset',
        ),
    ],
    outputs=[
        gr.Label(num_top_classes=3, label='confidence scores'),
        gr.Image(type="filepath", label='reconstructed masked autoencoder'),
    ],
    examples=[
        ["examples/k400.mp4"],
        ["examples/k400.mp4"],
        ["examples/k400.mp4"],
    ],
    title="VideoMAE",
).launch()