import json
from time import time

import gradio as gr
import onnxruntime as ort
from mediapipe.python.solutions import holistic
from torchvision.transforms.v2 import Compose, Lambda, Normalize

from utils import get_predictions, preprocess

title = '''

'''

cite_markdown = '''

'''

description = '''

'''

examples = [
    ['000_con_cho.mp4'],
]

# Load the ONNX model and its accompanying configuration files.
ort_session = ort.InferenceSession('videomae_skeleton_v2.3.onnx')
with open('config.json') as f:
    model_config = json.load(f)
with open('preprocessor_config.json') as f:
    preprocessor_config = json.load(f)

# Resolve the normalization statistics and model input size; preprocessor
# configs carry either a single 'shortest_edge' or explicit 'height'/'width'.
mean = preprocessor_config['image_mean']
std = preprocessor_config['image_std']
if 'shortest_edge' in preprocessor_config['size']:
    model_input_height = model_input_width = preprocessor_config['size']['shortest_edge']
else:
    model_input_height = preprocessor_config['size']['height']
    model_input_width = preprocessor_config['size']['width']

# Scale pixel values to [0, 1], then normalize with the preprocessor statistics.
transform = Compose(
    [
        Lambda(lambda x: x / 255.0),
        Normalize(mean=mean, std=std),
    ]
)
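
# A minimal sanity check (sketch, left commented out so it does not run at
# startup): v2 Normalize expects float input shaped (..., 3, H, W), so the
# transform should preserve the shape of such a clip; the 16-frame count
# below is an arbitrary example value.
#
#   import torch
#   frames = torch.randint(0, 256, (16, 3, model_input_height, model_input_width)).float()
#   assert transform(frames).shape == frames.shape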


def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress()

    Returns
    -------
    str
        The top-3 predictions.
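
    Examples
    --------
    A quick smoke test, assuming the ONNX model, both config files, and the
    bundled example video are available locally (skipped under doctest):

    >>> print(inference('000_con_cho.mp4'))  # doctest: +SKIP
    The top-3 predictions are:
    ...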
    '''
    progress(0, desc='Preprocessing video')
    # MediaPipe Holistic detector for extracting skeleton keypoints from the video.
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )

    start_time = time()
    inputs = preprocess(
        model_num_frames=model_config['num_frames'],
        keypoints_detector=keypoints_detector,
        source=video,
        model_input_height=model_input_height,
        model_input_width=model_input_width,
        transform=transform,
    )
    end_time = time()
    data_time = end_time - start_time
    keypoints_detector.close()  # Release MediaPipe graph resources.

    progress(1/2, desc='Getting predictions')
    start_time = time()
    predictions = get_predictions(
        inputs=inputs,
        ort_session=ort_session,
        id2gloss=model_config['id2label'],
        k=3,
    )
    end_time = time()
    model_time = end_time - start_time

    if len(predictions) == 0:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'

    progress(1, desc='Completed')

    return output_message


iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)

if __name__ == '__main__':
    iface.launch()
    # Quick local check without the UI:
    # print(inference('000_con_cho.mp4'))