# pandagpt-vicuna-v0-7b / code / pytorchvideo / tests / test_data_epic_kitchen_recognition.py
# mvsoom's picture
# Upload folder using huggingface_hub
# 3133fdb
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import unittest
import unittest.mock
import torch
from pytorchvideo.data import EpicKitchenRecognition
from pytorchvideo.data.epic_kitchen import ActionData
from pytorchvideo.data.epic_kitchen_recognition import ClipSampling
from pytorchvideo.data.frame_video import FrameVideo
class TestEpicKitchenRecognition(unittest.TestCase):
    """Unit tests for the private generator helpers of ``EpicKitchenRecognition``.

    Each test obtains a callable from one of the ``_*_generator`` helpers
    (invoked on the class itself) and checks its output on small, hand-built
    inputs.
    """

    def test_transform_generator(self) -> None:
        """Check action filtering and application of the user transform.

        The clip spans 2.5 s to 6.5 s and carries four actions. The test
        expects only "turn on light" and "close door" to remain after the
        transform, the clip's start/stop times to be unchanged, and the
        user-supplied transform (a ``permute``) to be applied to the video.
        """
        # NOTE(review): the two timestamp strings in each ActionData are
        # presumably the action's start and stop time -- confirm against the
        # ActionData definition.
        clip = {
            "start_time": 2.5,
            "stop_time": 6.5,
            "video": torch.rand(3, 4, 10, 20),
            "actions": [
                ActionData(
                    "P01",
                    "P01_01",
                    "turn off light",
                    "00:00:01.00",
                    "00:00:02.00",
                    262,
                    370,
                    "turn-off",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P01",
                    "P01_01",
                    "turn on light",
                    "00:00:04.00",
                    "00:00:06.00",
                    262,
                    370,
                    "turn-on",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P01",
                    "P01_01",
                    "close door",
                    "00:00:06.00",
                    "00:00:07.00",
                    418,
                    569,
                    "close",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
                ActionData(
                    "P01",
                    "P01_01",
                    "slam door",
                    "00:00:10.00",
                    "00:00:11.00",
                    408,
                    509,
                    "slam",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
            ],
        }

        def additional_transform(clip):
            # Move the first video dimension to the end so the test can tell
            # from the output shape that this transform was applied.
            clip["video"] = clip["video"].permute(1, 2, 3, 0)
            return clip

        transform_fn = EpicKitchenRecognition._transform_generator(
            additional_transform
        )
        transformed_clip = transform_fn(clip)

        self.assertEqual(len(transformed_clip["actions"]), 2)

        # Sort so the assertions do not depend on the order of the output.
        sorted_actions = sorted(
            transformed_clip["actions"], key=lambda action: action.start_time
        )
        self.assertEqual(sorted_actions[0].narration, "turn on light")
        self.assertEqual(sorted_actions[1].narration, "close door")

        self.assertEqual(transformed_clip["start_time"], 2.5)
        self.assertEqual(transformed_clip["stop_time"], 6.5)
        self.assertEqual(
            transformed_clip["video"].size(), torch.Size([4, 10, 20, 3])
        )

    def test_frame_filter_generator(self) -> None:
        """Check the frame filter when asked for 10, 5 and 1 of 10 frames."""
        input_list = list(range(10))

        # Requesting as many frames as there are inputs keeps every element.
        frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(10)
        all_elements = frame_filter_fn(input_list)
        self.assertEqual(all_elements, input_list)

        # Requesting half of the frames is expected to keep the even entries.
        frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(5)
        half_elements = frame_filter_fn(input_list)
        self.assertEqual(len(half_elements), 5)
        self.assertEqual(half_elements, [i for i in input_list if i % 2 == 0])

        # Requesting a single frame is expected to keep only the first one.
        frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(1)
        single_element = frame_filter_fn(input_list)
        self.assertEqual(len(single_element), 1)
        self.assertEqual(single_element[0], 0)

    def test_define_clip_structure_generator(self) -> None:
        """Check ``RandomOffsetUniform`` clip sampling with a fixed offset.

        ``random.random`` is patched to return 0.5. Every clip is then
        expected to last ``seconds_per_clip`` seconds, and the i-th clip of
        each video to start at ``seconds_per_clip * (i + 0.5)``.
        """
        seconds_per_clip = 5
        define_clip_structure_fn = (
            EpicKitchenRecognition._define_clip_structure_generator(
                seconds_per_clip=seconds_per_clip,
                clip_sampling=ClipSampling.RandomOffsetUniform,
            )
        )

        # NOTE(review): the second argument of from_frame_paths is presumably
        # the frame rate; under that reading the videos last 10 s, 30 s and
        # 20 s, which matches the expected clip counts below -- confirm.
        frame_videos = {
            "P01_003": FrameVideo.from_frame_paths(
                [f"root/P01_003/frame_{i}" for i in range(100)], 10
            ),
            "P02_004": FrameVideo.from_frame_paths(
                [f"root/P02_004/frame_{i}" for i in range(300)], 10
            ),
            "P11_010": FrameVideo.from_frame_paths(
                [f"root/P11_010/frame_{i}" for i in range(600)], 30
            ),
        }
        actions = {video_id: [] for video_id in frame_videos}
        expected_clip_counts = {"P01_003": 1, "P02_004": 5, "P11_010": 3}

        random_value = 0.5
        with unittest.mock.patch("random.random", return_value=random_value):
            clips = define_clip_structure_fn(frame_videos, actions)
            # Sort so the assertions do not depend on the order of the output.
            sorted_clips = sorted(clips, key=lambda c: c.start_time)

            for clip in sorted_clips:
                self.assertEqual(
                    clip.stop_time - clip.start_time, seconds_per_clip
                )

            for video_id, expected_count in expected_clip_counts.items():
                video_clips = [c for c in sorted_clips if c.video_id == video_id]
                self.assertEqual(
                    len(video_clips),
                    expected_count,
                    msg=f"unexpected number of clips for {video_id}",
                )
                for i, clip in enumerate(video_clips):
                    self.assertEqual(
                        clip.start_time,
                        seconds_per_clip * (i + random_value),
                        msg=f"unexpected start time for clip {i} of {video_id}",
                    )