Spaces:

nouvellevision
/

VSL-VideoMAE

Sleeping

App Files Community

tanthinhdt committed on Apr 18, 2024

Commit

5fee850

verified ·

1 Parent(s): 4703605

fix(app): adjust some params

Browse files

Files changed (1) hide show

app.py +48 -69

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import yaml
 import gradio as gr
 from mediapipe.python.solutions import holistic
 from torchvision.transforms.v2 import Compose, Lambda, Normalize
@@ -62,17 +61,38 @@ examples = [
 ]
 def inference(
     video: str,
     k: int,
-    model,
-    keypoints_detector,
-    data_height: int,
-    data_width: int,
-    model_input_height: int,
-    model_input_width: int,
-    device: str,
-    transform: Compose,
     progress: gr.Progress = gr.Progress(),
 ) -> tuple:
     progress(0, desc='Preprocessing video')
@@ -80,8 +100,6 @@ def inference(
         model_num_frames=model.config.num_frames,
         keypoints_detector=keypoints_detector,
         source=video,
-        data_height=data_height,
-        data_width=data_width,
         model_input_height=model_input_height,
         model_input_width=model_input_width,
         device=device,
@@ -100,61 +118,22 @@ def inference(
     return output_message
-if __name__ == '__main__':
-    with open('config.yaml', 'r') as file:
-        config = yaml.safe_load(file)
-    device = 'cpu'
-    image_processor = VideoMAEImageProcessor.from_pretrained(config['model']['name'])
-    model = VideoMAEForVideoClassification.from_pretrained(config['model']['name'])
-    model = model.eval().to(device)
-    mean = image_processor.image_mean
-    std = image_processor.image_std
-    if 'shortest_edge' in image_processor.size:
-        height = width = image_processor.size['shortest_edge']
-    else:
-        height = image_processor.size['height']
-        width = image_processor.size['width']
-    keypoints_detector = holistic.Holistic(
-        static_image_mode=False,
-        model_complexity=2,
-        enable_segmentation=True,
-        refine_face_landmarks=True,
-    )
-    transform = Compose(
-        [
-            Lambda(lambda x: x / 255.0),
-            Normalize(mean=mean, std=std),
-        ]
-    )
-    iface = gr.Interface(
-        fn=inference,
-        inputs=[
-            'video',
-            gr.components.Slider(
-                minimum=1,
-                maximum=5,
-                value=3,
-                step=1,
-                label='k',
-                info='Return top-k results',
-            ),
-            model,
-            keypoints_detector,
-            config['data']['height'],
-            config['data']['width'],
-            height,
-            width,
-            device,
-            transform,
-        ],
-        outputs='text',
-        examples=examples,
-        title=title,
-        description=description,
-    )
-    iface.launch()

 import gradio as gr
 from mediapipe.python.solutions import holistic
 from torchvision.transforms.v2 import Compose, Lambda, Normalize
 ]
+device = 'cpu'
+model_name = 'VieSignLang/videomae_skeleton_v1.0'
+image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
+model = VideoMAEForVideoClassification.from_pretrained(model_name)
+model = model.eval().to(device)
+mean = image_processor.image_mean
+std = image_processor.image_std
+if 'shortest_edge' in image_processor.size:
+    model_input_height = model_input_width = image_processor.size['shortest_edge']
+else:
+    model_input_height = image_processor.size['height']
+    model_input_width = image_processor.size['width']
+keypoints_detector = holistic.Holistic(
+    static_image_mode=False,
+    model_complexity=2,
+    enable_segmentation=True,
+    refine_face_landmarks=True,
+)
+transform = Compose(
+    [
+        Lambda(lambda x: x / 255.0),
+        Normalize(mean=mean, std=std),
+    ]
+)
 def inference(
     video: str,
     k: int,
     progress: gr.Progress = gr.Progress(),
 ) -> tuple:
     progress(0, desc='Preprocessing video')
         model_num_frames=model.config.num_frames,
         keypoints_detector=keypoints_detector,
         source=video,
         model_input_height=model_input_height,
         model_input_width=model_input_width,
         device=device,
     return output_message
+iface = gr.Interface(
+    fn=inference,
+    inputs=[
+        'video',
+        gr.components.Slider(
+            minimum=1,
+            maximum=5,
+            value=3,
+            step=1,
+            label='k',
+            info='Return top-k results',
+        ),
+    ],
+    outputs='text',
+    examples=examples,
+    title=title,
+    description=description,
+)
+iface.launch()