Chat-UniVi
committed on
Commit
•
164f596
1
Parent(s):
a70ee08
Update README.md
Browse files
README.md
CHANGED
@@ -39,11 +39,6 @@ import numpy as np
|
|
39 |
|
40 |
def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
|
41 |
# speed up video decode via decord.
|
42 |
-
video_mask = np.zeros(max_frames, dtype=np.int64)
|
43 |
-
max_video_length = 0
|
44 |
-
|
45 |
-
# T x 3 x H x W
|
46 |
-
video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
|
47 |
|
48 |
if s is None:
|
49 |
start_time, end_time = None, None
|
@@ -83,25 +78,22 @@ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH,
|
|
83 |
patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
|
84 |
slice_len = patch_images.shape[0]
|
85 |
|
86 |
-
|
87 |
-
if slice_len < 1:
|
88 |
-
pass
|
89 |
-
else:
|
90 |
-
video[:slice_len, ...] = patch_images
|
91 |
-
|
92 |
-
return patch_images, video_mask
|
93 |
else:
|
94 |
print("video path: {} error.".format(video_path))
|
95 |
|
96 |
-
video_mask[:max_video_length] = [1] * max_video_length
|
97 |
-
|
98 |
-
return torch.from_numpy(video), video_mask
|
99 |
|
100 |
if __name__ == '__main__':
|
101 |
# Model Parameter
|
102 |
model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
|
103 |
video_path = ${video_path}
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
# Input Text
|
107 |
qs = "Describe the video."
|
@@ -136,13 +128,13 @@ if __name__ == '__main__':
|
|
136 |
|
137 |
# Check if the video exists
|
138 |
if video_path is not None:
|
139 |
-
video_frames,
|
140 |
|
141 |
cur_prompt = qs
|
142 |
if model.config.mm_use_im_start_end:
|
143 |
-
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN *
|
144 |
else:
|
145 |
-
qs = DEFAULT_IMAGE_TOKEN *
|
146 |
|
147 |
conv = conv_templates[conv_mode].copy()
|
148 |
conv.append_message(conv.roles[0], qs)
|
|
|
39 |
|
40 |
def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
|
41 |
# speed up video decode via decord.
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
if s is None:
|
44 |
start_time, end_time = None, None
|
|
|
78 |
patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
|
79 |
slice_len = patch_images.shape[0]
|
80 |
|
81 |
+
return patch_images, slice_len
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
else:
|
83 |
print("video path: {} error.".format(video_path))
|
84 |
|
|
|
|
|
|
|
85 |
|
86 |
if __name__ == '__main__':
|
87 |
# Model Parameter
|
88 |
model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
|
89 |
video_path = ${video_path}
|
90 |
+
|
91 |
+
# The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
|
92 |
+
# When the video is long, we uniformly downsample it so that the number of frames is at most "max_frames".
|
93 |
+
max_frames = 100
|
94 |
+
|
95 |
+
# The number of frames retained per second in the video.
|
96 |
+
video_framerate = 1
|
97 |
|
98 |
# Input Text
|
99 |
qs = "Describe the video."
|
|
|
128 |
|
129 |
# Check if the video exists
|
130 |
if video_path is not None:
|
131 |
+
video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames, video_framerate=video_framerate)
|
132 |
|
133 |
cur_prompt = qs
|
134 |
if model.config.mm_use_im_start_end:
|
135 |
+
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
|
136 |
else:
|
137 |
+
qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs
|
138 |
|
139 |
conv = conv_templates[conv_mode].copy()
|
140 |
conv.append_message(conv.roles[0], qs)
|