Spaces:
Running
on
Zero
Running
on
Zero
update inference model.
Browse files
- app.py +5 -3
- videollama2/mm_utils.py +4 -1
app.py
CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
|
|
7 |
|
8 |
import sys
|
9 |
sys.path.append('./')
|
10 |
-
from videollama2.constants import MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
|
11 |
from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
|
12 |
from videollama2.model.builder import load_pretrained_model
|
13 |
from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
|
@@ -155,12 +155,14 @@ def generate(image, video, state, state_, textbox_in, temperature, top_p, max_ou
|
|
155 |
|
156 |
text_en_in = textbox_in.replace("picture", "image")
|
157 |
|
|
|
|
|
158 |
processor = handler.processor
|
159 |
if os.path.exists(image) and not os.path.exists(video):
|
160 |
tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
|
161 |
modals.append('IMAGE')
|
162 |
if not os.path.exists(image) and os.path.exists(video):
|
163 |
-
tensor.append(process_video(video, processor).to(handler.model.device, dtype=dtype))
|
164 |
modals.append('VIDEO')
|
165 |
if os.path.exists(image) and os.path.exists(video):
|
166 |
raise NotImplementedError("Not support image and video at the same time")
|
@@ -222,7 +224,7 @@ def clear_history(state, state_):
|
|
222 |
# 3. The function can't return tensor or other cuda objects.
|
223 |
|
224 |
conv_mode = "llama_2"
|
225 |
-
model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
|
226 |
|
227 |
device = torch.device("cuda")
|
228 |
|
|
|
7 |
|
8 |
import sys
|
9 |
sys.path.append('./')
|
10 |
+
from videollama2.constants import NUM_FRAMES, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
|
11 |
from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
|
12 |
from videollama2.model.builder import load_pretrained_model
|
13 |
from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
|
|
|
155 |
|
156 |
text_en_in = textbox_in.replace("picture", "image")
|
157 |
|
158 |
+
num_frames = handler.model.config.num_frames if hasattr(handler.model.config, "num_frames") else NUM_FRAMES
|
159 |
+
|
160 |
processor = handler.processor
|
161 |
if os.path.exists(image) and not os.path.exists(video):
|
162 |
tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
|
163 |
modals.append('IMAGE')
|
164 |
if not os.path.exists(image) and os.path.exists(video):
|
165 |
+
tensor.append(process_video(video, processor, num_frames=num_frames, sample_scheme='fps').to(handler.model.device, dtype=dtype))
|
166 |
modals.append('VIDEO')
|
167 |
if os.path.exists(image) and os.path.exists(video):
|
168 |
raise NotImplementedError("Not support image and video at the same time")
|
|
|
224 |
# 3. The function can't return tensor or other cuda objects.
|
225 |
|
226 |
conv_mode = "llama_2"
|
227 |
+
model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
|
228 |
|
229 |
device = torch.device("cuda")
|
230 |
|
videollama2/mm_utils.py
CHANGED
@@ -381,7 +381,10 @@ def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAM
|
|
381 |
elif mode == 'fps':
|
382 |
assert local_fps is not None
|
383 |
segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
|
384 |
-
|
|
|
|
|
|
|
385 |
else:
|
386 |
raise ImportError(f'Unsupported frame sampling mode: {mode}')
|
387 |
|
|
|
381 |
elif mode == 'fps':
|
382 |
assert local_fps is not None
|
383 |
segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
|
384 |
+
frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
|
385 |
+
if len(frame_id_list) < num_frames:
|
386 |
+
frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
|
387 |
+
return frame_id_list
|
388 |
else:
|
389 |
raise ImportError(f'Unsupported frame sampling mode: {mode}')
|
390 |
|