File size: 3,281 Bytes
09295f0
f07aecd
09295f0
 
 
 
 
 
 
 
 
 
 
 
1b70000
 
 
 
 
 
 
 
 
 
 
 
 
09295f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b70000
09295f0
 
 
 
 
 
2e72a10
09295f0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch
import os
import gradio as gr
from video_transformers import VideoModel
from utils import (
    convert_frames_to_gif,
    download_youtube_video,
    sample_frames_from_video_file,
)

# Checkpoint identifier passed to VideoModel.from_transformers; the model is
# loaded once, when this module is executed.
MODEL_CHECKPOINT = "facebook/timesformer-base-finetuned-k400"
video_model = VideoModel.from_transformers(MODEL_CHECKPOINT)


# Example inputs for the demo: one single-element row (the YouTube URL) per
# video, built from the video ids so the common URL prefix is written once.
_YOUTUBE_WATCH_PREFIX = "https://www.youtube.com/watch?v="
_EXAMPLE_VIDEO_IDS = (
    "huAJ9dC5lmI",
    "wvcWt6u5HTg",
    "-3kZSi5qjRM",
    "-6usjfP8hys",
    "BDHub0gBGtc",
    "B9ea7YyCP6E",
    "BBkpaeJBKmk",
    "BBqU8Apee_g",
    "B8OdMwVwyXc",
    "I7cwq6_4QtM",
    "Z0mJDXpNhYA",
    "QkQQjFGnZlg",
    "IQaoRUQif14",
)
examples = [[_YOUTUBE_WATCH_PREFIX + video_id] for video_id in _EXAMPLE_VIDEO_IDS]


def predict(youtube_url):
    """Classify the content of a YouTube video and build a preview GIF.

    The video is downloaded, 16 frames are sampled from it for the preview
    GIF, and the module-level ``video_model`` is run on the downloaded file.
    The downloaded file is deleted before returning, whether or not the
    processing succeeded.

    Args:
        youtube_url: URL of the video, passed to ``download_youtube_video``.

    Returns:
        A ``(predictions, gif_path)`` tuple: the ``"predictions"`` entry of
        the model result and the value returned by ``convert_frames_to_gif``.
    """
    video_path = download_youtube_video(youtube_url)

    try:
        frames = sample_frames_from_video_file(video_path, num_frames=16)
        gif_path = convert_frames_to_gif(frames)

        result = video_model.predict(video_or_folder_path=video_path)
    finally:
        # Remove the download even when sampling, GIF conversion or inference
        # raises, so failed requests do not leave video files behind.
        os.remove(video_path)

    return result["predictions"], gif_path


# Build the demo page. Components are created inside the Blocks context in
# the order they should appear, so the statement order below is significant.
with gr.Blocks() as app:
    # Header: title, model description and author links.
    gr.Markdown("# **<p align='center'>Video Classification with Timesformer</p>**")
    gr.Markdown(
        """
        <p style='text-align: center'>
        Timesformer is a video model that uses a Transformer architecture to process video frames.
        <br>It is released by Facebook AI Research in ICML 2021.
        <br>This version is trained on Kinetics-400 dataset and can classify videos into 400 classes.
        </p>
        """
    )
    gr.Markdown(
        """
        <p style='text-align: center'>
        Follow me for more! 
        <br> <a href='https://twitter.com/fcakyon' target='_blank'>twitter</a> | <a href='https://github.com/fcakyon' target='_blank'>github</a> | <a href='https://www.linkedin.com/in/fcakyon/' target='_blank'>linkedin</a> | <a href='https://fcakyon.medium.com/' target='_blank'>medium</a>
        </p>
        """
    )

    # Main row: URL input + button | preview clip | predicted labels.
    with gr.Row():
        with gr.Column():
            gr.Markdown("Provide a Youtube video URL.")
            youtube_url = gr.Textbox(
                label="Youtube URL:",
                show_label=True,
            )
            predict_btn = gr.Button(value="Predict")
        with gr.Column():
            video_gif = gr.Image(label="Input Clip", show_label=True)
        with gr.Column():
            predictions = gr.Label(
                label="Predictions:",
                show_label=True,
                num_top_classes=5,
            )

    # Clickable example URLs; their outputs are cached via cache_examples.
    gr.Markdown("**Examples:**")
    gr.Examples(
        examples=examples,
        inputs=youtube_url,
        outputs=[predictions, video_gif],
        fn=predict,
        cache_examples=True,
    )

    # Wire the button to the prediction function.
    predict_btn.click(
        fn=predict,
        inputs=youtube_url,
        outputs=[predictions, video_gif],
    )

    # Footer credits.
    gr.Markdown(
        """
        \n Demo created by: <a href=\"https://github.com/fcakyon\">fcakyon</a>
        <br> Based on this <a href=\"https://huggingface.co/docs/transformers/main/model_doc/timesformer">HuggingFace model</a>
        """
    )

# Start the Gradio server for the assembled page.
app.launch()