# Gradio demo: ask questions about an uploaded video with a video-chat model.
#
# NOTE: importing this module loads the tokenizer and the model and moves the
# model to the GPU in half precision, so a CUDA device is required.
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Model setting
model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load the tokenizer and model with remote code enabled.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; `get_vision_tower()` and `chat()` below come from that remote
# code, so their exact contract is not visible here -- confirm against the
# model card.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()

# Get the image processor from the vision tower.
image_processor = model.get_vision_tower().image_processor

# Evaluation settings
max_num_frames = 512
generation_config = {
    "do_sample": False,
    "temperature": 0.0,
    "max_new_tokens": 1024,
    "top_p": 0.1,
    "num_beams": 1,
}

video_path = "your_video.mp4"  # (For testing locally, update as needed)


def _chat(video_path, user_prompt, **extra):
    """Call ``model.chat`` with the shared module-level settings.

    Args:
        video_path: Path of the video handed to the model.
        user_prompt: Question text for this turn.
        **extra: Additional keyword arguments forwarded unchanged to
            ``model.chat`` (e.g. ``chat_history``).

    Returns:
        Whatever ``model.chat`` returns when called with
        ``return_history=True``; callers unpack it as ``(output, history)``.
    """
    return model.chat(
        video_path=video_path,
        tokenizer=tokenizer,
        user_prompt=user_prompt,
        return_history=True,
        max_num_frames=max_num_frames,
        generation_config=generation_config,
        **extra,
    )


# Single-turn conversation example:
def single_turn_chat(video_path, user_prompt):
    """Ask one question about a video and return only the model's answer.

    No ``chat_history`` is passed to the model, and the history it returns is
    discarded.
    """
    output, _ = _chat(video_path, user_prompt)
    return output


# Multi-turn conversation example:
def multi_turn_chat(video_path, user_prompt, chat_history):
    """Ask a follow-up question, passing the previous history to the model.

    Args:
        video_path: Path of the video handed to the model.
        user_prompt: Question text for this turn.
        chat_history: History value returned by an earlier call; forwarded
            to ``model.chat`` as-is.

    Returns:
        A ``(output, chat_history)`` tuple with the answer and the updated
        history returned by the model.
    """
    output, chat_history = _chat(video_path, user_prompt, chat_history=chat_history)
    return output, chat_history


# For the Gradio interface, we'll combine these into a chat function.
def chat_interface(video_path, user_prompt, chat_history):
    """Gradio callback: run one chat turn and return the answer and new history.

    Args:
        video_path: Value of the video input; falsy when nothing is uploaded.
        user_prompt: Text typed by the user.
        chat_history: Conversation state from previous turns, or ``None``.

    Returns:
        A ``(output, new_history)`` tuple, matching the two Gradio outputs
        (response textbox and conversation state).

    Raises:
        gr.Error: If no video is uploaded or the question is blank, so the
            user gets a readable message instead of a failure from inside
            the model call.
    """
    if not video_path:
        raise gr.Error("Please upload a video before sending a question.")
    if not user_prompt or not user_prompt.strip():
        raise gr.Error("Please enter a question.")

    if chat_history is None:
        chat_history = []
    return multi_turn_chat(video_path, user_prompt, chat_history)


# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")

    with gr.Row():
        # gr.Video has no `type` keyword (that belongs to components such as
        # gr.Image / gr.File); it already passes the uploaded file's path to
        # the callback, so the argument is dropped.
        video_input = gr.Video(label="Upload Video")
        question_input = gr.Textbox(
            label="Enter your question",
            placeholder="Type your question here...",
        )

    chat_state = gr.State([])  # To maintain conversation history
    output_text = gr.Textbox(label="Model Response")
    send_btn = gr.Button("Send")

    # Each click runs one chat turn; the returned history is written back
    # into chat_state so the next turn continues the conversation.
    send_btn.click(
        chat_interface,
        inputs=[video_input, question_input, chat_state],
        outputs=[output_text, chat_state],
    )

    # The stored history refers to the video it was started with. Start a
    # fresh conversation whenever the video is replaced or cleared so earlier
    # turns about a different video are not sent along with the new one.
    video_input.change(lambda: [], inputs=None, outputs=chat_state)

if __name__ == "__main__":
    demo.launch()