import json
import gradio as gr
from time import time
import onnxruntime as ort
from mediapipe.python.solutions import holistic
from torchvision.transforms.v2 import Compose, Lambda, Normalize
from utils import get_predictions, preprocess
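
# Gradio demo for Vietnamese Sign Language recognition using an ONNX VideoMAE + skeleton model.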
title = '''
'''
cite_markdown = '''
'''
description = '''
'''
examples = [
    ['000_con_cho.mp4'],
]
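
# Load the ONNX model and its accompanying configuration files.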
ort_session = ort.InferenceSession('videomae_skeleton_v2.3.onnx')
with open('config.json') as f:
    model_config = json.load(f)
with open('preprocessor_config.json') as f:
    preprocessor_config = json.load(f)
mean = preprocessor_config['image_mean']
std = preprocessor_config['image_std']
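
# Resolve the model input resolution from the preprocessor configuration.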
if 'shortest_edge' in preprocessor_config['size']:
    model_input_height = model_input_width = preprocessor_config['size']['shortest_edge']
else:
    model_input_height = preprocessor_config['size']['height']
    model_input_width = preprocessor_config['size']['width']

# Transform for model inputs: scale pixel values to [0, 1], then normalize with the preprocessor mean/std.
transform = Compose(
    [
        Lambda(lambda x: x / 255.0),
        Normalize(mean=mean, std=std),
    ]
)


def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress().

    Returns
    -------
    str
        The top-3 predictions and timing information.
    '''
    progress(0, desc='Preprocessing video')
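
    # MediaPipe Holistic detector for extracting body, face, and hand keypoints.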
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )
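
    # Extract keypoints and build the model inputs, timing the preprocessing step.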
    start_time = time()
    inputs = preprocess(
        model_num_frames=model_config['num_frames'],
        keypoints_detector=keypoints_detector,
        source=video,
        model_input_height=model_input_height,
        model_input_width=model_input_width,
        transform=transform,
    )
    end_time = time()
    data_time = end_time - start_time
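
    # Run ONNX Runtime inference and keep the top-3 predictions.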
    progress(1/2, desc='Getting predictions')
    start_time = time()
    predictions = get_predictions(
        inputs=inputs,
        ort_session=ort_session,
        id2gloss=model_config['id2label'],
        k=3,
    )
    end_time = time()
    model_time = end_time - start_time
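
    # Format the predictions and timing information for display.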
    if len(predictions) == 0:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            output_message += f'\t{i + 1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'

    progress(1, desc='Completed')
    return output_message
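

# Build and launch the Gradio interface.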
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
iface.launch()
# print(inference('000_con_cho.mp4'))