Intel
/

tvp-base

@@ -37,13 +37,13 @@ from transformers import AutoProcessor, TvpForVideoGrounding
 def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
-    """
     Convert the video from its original fps to the target_fps and decode the video with PyAV decoder.
     Returns:
         frames (tensor): decoded frames from the video. Return None if no
             video stream was found.
         fps (float): the number of frames per second of the video.
-    """
     fps = float(container.streams.video[0].average_rate)
     clip_size = sampling_rate * num_frames / target_fps * fps
     delta = max(container.streams.video[0].frames - clip_size, 0)
@@ -65,12 +65,11 @@ def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, targe
             frames[frame.pts] = frame
             break
     frames = [frames[pts] for pts in sorted(frames)]
     return frames, fps
 def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
-    """
     Decode the video and perform temporal sampling.
     Args:
         container (container): pyav container.
@@ -84,7 +83,7 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
             the target video fps before frame sampling.
     Returns:
         frames (tensor): decoded frames from the video.
-    """
     assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
     frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
     clip_size = sampling_rate * num_frames / target_fps * fps
@@ -93,22 +92,19 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
     frames = [frames[idx] for idx in index]
     frames = [frame.to_rgb().to_ndarray() for frame in frames]
     frames = torch.from_numpy(np.stack(frames))
     return frames
 def get_resize_size(image, max_size):
-    """
     Args:
         image: np.ndarray
         max_size: The max size of height and width
     Returns:
         (height, width)
     Note the height/width order difference >>> pil_img = Image.open("raw_img_tensor.jpg") >>> pil_img.size (640,
     480) # (width, height) >>> np_img = np.array(pil_img) >>> np_img.shape (480, 640, 3) # (height, width, 3)
-    """
     height, width = image.shape[-2:]
     if height >= width:
         ratio = width * 1.0 / height
         new_height = max_size
@@ -120,32 +116,29 @@ def get_resize_size(image, max_size):
     size = {"height": int(new_height), "width": int(new_width)}
     return size
-file = hf_hub_download(repo_id="Intel/tvp_demo", filename="3MSZA.mp4", repo_type="dataset")
 model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
 decoder_kwargs = dict(
     container=av.open(file, metadata_errors="ignore"),
     sampling_rate=1,
-    num_frames=model.config.num_frm,
     clip_idx=0,
     num_clips=1,
     target_fps=3,
 )
-raw_sampled_frms = decode(**decoder_kwargs)
-raw_sampled_frms = raw_sampled_frms.permute(0, 3, 1, 2)
-text = "person turn a light on."
 processor = AutoProcessor.from_pretrained("Intel/tvp-base")
 size = get_resize_size(raw_sampled_frms, model.config.max_img_size)
-data = processor(
     text=[text], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100, size=size
 )
-data["pixel_values"] = data["pixel_values"].to(model.dtype)
-data["labels"] = torch.tensor([30.96, 24.3, 30.4])
-output = model(**data)
 print(f"The model's output is {output}")
 def get_video_duration(filename):

 def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
+    '''
     Convert the video from its original fps to the target_fps and decode the video with PyAV decoder.
     Returns:
         frames (tensor): decoded frames from the video. Return None if no
             video stream was found.
         fps (float): the number of frames per second of the video.
+    '''
     fps = float(container.streams.video[0].average_rate)
     clip_size = sampling_rate * num_frames / target_fps * fps
     delta = max(container.streams.video[0].frames - clip_size, 0)
             frames[frame.pts] = frame
             break
     frames = [frames[pts] for pts in sorted(frames)]
     return frames, fps
 def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
+    '''
     Decode the video and perform temporal sampling.
     Args:
         container (container): pyav container.
             the target video fps before frame sampling.
     Returns:
         frames (tensor): decoded frames from the video.
+    '''
     assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
     frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
     clip_size = sampling_rate * num_frames / target_fps * fps
     frames = [frames[idx] for idx in index]
     frames = [frame.to_rgb().to_ndarray() for frame in frames]
     frames = torch.from_numpy(np.stack(frames))
     return frames
 def get_resize_size(image, max_size):
+    '''
     Args:
         image: np.ndarray
         max_size: The max size of height and width
     Returns:
         (height, width)
     Note the height/width order difference >>> pil_img = Image.open("raw_img_tensor.jpg") >>> pil_img.size (640,
     480) # (width, height) >>> np_img = np.array(pil_img) >>> np_img.shape (480, 640, 3) # (height, width, 3)
+    '''
     height, width = image.shape[-2:]
     if height >= width:
         ratio = width * 1.0 / height
         new_height = max_size
     size = {"height": int(new_height), "width": int(new_width)}
     return size
+file = hf_hub_download(repo_id="Intel/tvp_demo", filename="AK2KG.mp4", repo_type="dataset")
 model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
 decoder_kwargs = dict(
     container=av.open(file, metadata_errors="ignore"),
     sampling_rate=1,
+    num_frames=model.config.num_frames,
     clip_idx=0,
     num_clips=1,
     target_fps=3,
 )
+raw_sampled_frms = decode(**decoder_kwargs).permute(0, 3, 1, 2)
+text = "a person is sitting on a bed."
 processor = AutoProcessor.from_pretrained("Intel/tvp-base")
 size = get_resize_size(raw_sampled_frms, model.config.max_img_size)
+model_inputs = processor(
     text=[text], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100, size=size
 )
+model_inputs["pixel_values"] = model_inputs["pixel_values"].to(model.dtype)
+model_inputs["labels"] = torch.tensor([18.1, 0.0, 6.8])
+output = model(**model_inputs)
 print(f"The model's output is {output}")
 def get_video_duration(filename):