Spaces:

innat
/

VideoMAE

Running

File size: 3,222 Bytes

import gradio as gr
import numpy as np
import imageio

import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map


# Checkpoint locations per dataset. Order matters: get_model() loads entry 0
# as the fine-tuned model and entry 1 as the pre-trained model.
_K400_CHECKPOINTS = [
    'innat/videomae/TFVideoMAE_S_K400_16x224_FT',
    'innat/videomae/TFVideoMAE_S_K400_16x224_PT',
]

# 'SSv2' and 'UCF' have no checkpoints registered yet (empty lists).
MODELS = {
    'K400': _K400_CHECKPOINTS,
    'SSv2': [],
    'UCF': [],
}


def tube_mask_generator():
    """Create a fresh boolean tube mask for the pre-trained model.

    The mask grid is ``(num_frames // 2, input_size // patch_size[0],
    input_size // patch_size[1])`` and is produced by
    ``TubeMaskingGenerator`` with a mask ratio of 0.70.

    Returns:
        A ``tf.bool`` tensor: the generator output converted to int32,
        given a new leading axis of size 1, then cast to bool.
    """
    patch_h, patch_w = patch_size[0], patch_size[1]
    # NOTE(review): the halved frame axis presumably reflects a tubelet
    # size of 2 in the model -- confirm against the utils module.
    grid = (num_frames // 2, input_size // patch_h, input_size // patch_w)

    generator = TubeMaskingGenerator(input_size=grid, mask_ratio=0.70)
    mask = tf.constant(generator(), dtype=tf.int32)
    return tf.cast(tf.expand_dims(mask, axis=0), tf.bool)


def video_to_gif(video_array, gif_filename):
    """Save ``video_array`` as an animation at ``gif_filename``.

    Thin wrapper around ``imageio.mimsave`` that fixes ``duration=100``.
    """
    imageio.mimsave(gif_filename, video_array, duration=100)


# Loaded (fine-tuned, pre-trained) model pairs keyed by dataset name, so the
# checkpoints are read from disk once instead of on every request.
_MODEL_CACHE = {}


def get_model(data_type):
    """Return the models and label lookup for one dataset.

    Args:
        data_type: A key of ``MODELS`` (e.g. ``'K400'``).

    Returns:
        A tuple ``(ft_model, pt_model, label_map)`` where ``ft_model`` is
        loaded from ``MODELS[data_type][0]``, ``pt_model`` from
        ``MODELS[data_type][1]``, and ``label_map`` is the dataset's label
        map with keys and values swapped.

    Raises:
        KeyError: If ``data_type`` is not a key of ``MODELS``.
        ValueError: If fewer than two checkpoints are registered for
            ``data_type``.
    """
    checkpoints = MODELS[data_type]
    if len(checkpoints) < 2:
        raise ValueError(
            f"No fine-tuned/pre-trained checkpoints are registered for "
            f"dataset {data_type!r}."
        )

    # Pick the label map that belongs to the requested dataset instead of
    # always using the K400 one.
    # NOTE(review): assumes the SSv2 and UCF maps have the same
    # key/value layout as K400_label_map -- confirm in labels.py.
    label_maps = {
        'K400': K400_label_map,
        'SSv2': SSv2_label_map,
        'UCF': UCF_label_map,
    }

    if data_type not in _MODEL_CACHE:
        ft_path, pt_path = checkpoints[0], checkpoints[1]
        _MODEL_CACHE[data_type] = (
            keras.models.load_model(ft_path),
            keras.models.load_model(pt_path),
        )
    ft_model, pt_model = _MODEL_CACHE[data_type]

    # Invert the map so callers can look labels up by the stored value.
    label_map = {v: k for k, v in label_maps[data_type].items()}
    return ft_model, pt_model, label_map


def inference(video_file, dataset_type):
    """Classify a video and render its masked-autoencoder reconstruction.

    Args:
        video_file: Video handle/path as delivered by the Gradio video
            input; forwarded unchanged to ``read_video``.
        dataset_type: Dataset name forwarded to ``get_model``.

    Returns:
        A tuple ``(confidences, gif_path)``. ``confidences`` maps each label
        to its softmax probability, inserted in descending-probability
        order. ``gif_path`` is the path of a GIF whose frames place the
        input, the masked input and the reconstruction side by side.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    container = read_video(video_file)
    frames = frame_sampling(container, num_frames=num_frames)
    bool_masked_pos_tf = tube_mask_generator()
    ft_model, pt_model, label_map = get_model(dataset_type)
    ft_model.trainable = False
    pt_model.trainable = False

    # Add the leading batch axis once and reuse it for both models.
    batch = frames[None, ...]

    # inference on fine-tune model
    outputs_ft = ft_model(batch, training=False)
    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
    confidences = {
        label_map[i]: float(probabilities[i])
        for i in np.argsort(probabilities)[::-1]
    }

    # inference on pre-trained model
    outputs_pt = pt_model(batch, bool_masked_pos_tf, training=False)
    reconstruct_output, mask = reconstrunction(
        batch, bool_masked_pos_tf, outputs_pt
    )

    input_frame = denormalize(frames)
    input_mask = denormalize(mask[0] * frames)
    output_frame = denormalize(reconstruct_output)

    # Distinct name: do not rebind `frames`, which holds the input clip.
    combined_frames = [
        np.hstack(triple)
        for triple in zip(input_frame, input_mask, output_frame)
    ]

    # Write to a unique temp file rather than a fixed 'combined.gif' so that
    # concurrent requests cannot overwrite each other's output. The file is
    # intentionally not deleted here: Gradio reads it after we return.
    with tempfile.NamedTemporaryFile(suffix='.gif', delete=False) as tmp:
        combined_gif = tmp.name
    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
    return confidences, combined_gif


# Gradio UI: one video plus a dataset selector in; class confidences and the
# side-by-side reconstruction GIF out.
video_input = gr.Video(type="file")
dataset_input = gr.Radio(
    ['K400', 'SSv2', 'UCF'],
    label='Dataset', value='K400'
)
scores_output = gr.Label(num_top_classes=3, label='confidence scores')
gif_output = gr.Image(
    type="filepath", label='reconstructed masked autoencoder'
)

# Three example rows, each supplying only the video input.
example_rows = [["examples/k400.mp4"] for _ in range(3)]

demo = gr.Interface(
    fn=inference,
    inputs=[video_input, dataset_input],
    outputs=[scores_output, gif_output],
    examples=example_rows,
    title="VideoMAE",
)
demo.launch()