ZhangYuanhan committed
Commit 013210b
1 parent: d5cd10a

Update README.md

Files changed (1)
README.md  +1 -1
README.md CHANGED
@@ -202,7 +202,7 @@ video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].c
 video = [video]
 conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
 time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
-question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\nPlease describe this video in detail."
+question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruciton}\nPlease describe this video in detail."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
 conv.append_message(conv.roles[1], None)
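
The single changed line inserts a newline between the image placeholder token and the timing instruction, so the placeholder ends up on its own line in the composed prompt. Below is a minimal, self-contained sketch of the effect on the prompt string; it assumes `DEFAULT_IMAGE_TOKEN` is the literal `"<image>"` placeholder used in the LLaVA code, and the timing values are illustrative stand-ins for what the video loader would produce.

```python
# Sketch of the prompt string before and after this commit.
# Assumption: DEFAULT_IMAGE_TOKEN is the literal "<image>" placeholder;
# video_time, frame_time, and the frame count below are illustrative values.
DEFAULT_IMAGE_TOKEN = "<image>"
video_time = 10.0
num_frames = 4
frame_time = "0.00s, 2.50s, 5.00s, 7.50s"

time_instruciton = (
    f"The video lasts for {video_time:.2f} seconds, and {num_frames} frames are "
    f"uniformly sampled from it. These frames are located at {frame_time}."
    "Please answer the following questions related to this video."
)

# Before this commit: the instruction text is glued directly to the image token.
old_question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\nPlease describe this video in detail."

# After this commit: a newline separates the image token from the instruction.
new_question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruciton}\nPlease describe this video in detail."

print(new_question)
# <image>
# The video lasts for 10.00 seconds, and 4 frames are uniformly sampled from it. ...
# Please describe this video in detail.
```

Everything else in the README's inference example is unchanged: the composed `question` is still passed to `conv.append_message(conv.roles[0], question)` exactly as before.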