diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..f9e47ffbee7e6c904ebe99bbd343e38b582c19f8 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+videollama2/serve/examples/1034346401.mp4 filter=lfs diff=lfs merge=lfs -text
+videollama2/serve/examples/sample_demo_1.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/handler.py b/handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda1e6e20037a54c4b65d07baaee7183bcd2729b
--- /dev/null
+++ b/handler.py
@@ -0,0 +1,82 @@
+from typing import Dict, List, Any
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+class EndpointHandler:
+    def __init__(self, path: str = ""):
+        """
+        Initialize the handler by loading the model and any other necessary components.
+
+        Args:
+            path (str): The path to the model or other necessary files.
+        """
+        disable_torch_init()
+        self.model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
+        self.model, self.processor, self.tokenizer = model_init(self.model_path)
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Handle inference requests.
+
+        Args:
+            data (Dict[str, Any]): The input data for inference. Expected keys:
+                - 'modal' (str): 'video' or 'image'
+                - 'modal_path' (str): Path to the video or image file
+                - 'instruct' (str): The instruction/query to process
+
+        Returns:
+            List[Dict[str, Any]]: The output of the inference.
+        """
+        modal = data.get("modal", "video")
+        modal_path = data.get("modal_path", "")
+        instruct = data.get("instruct", "")
+
+        if not modal_path or not instruct:
+            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")
+
+        # Perform inference
+        output = mm_infer(
+            self.processor[modal](modal_path),
+            instruct,
+            model=self.model,
+            tokenizer=self.tokenizer,
+            do_sample=False,
+            modal=modal
+        )
+
+        return [{"output": output}]
+
+
+# from transformers import pipeline
+
+# class EndpointHandler:
+#     def __init__(self, path: str = ""):
+#         """
+#         Initialize the handler by setting up the environment and loading the model.
+#         """
+#         # Use a pipeline as a high-level helper to download and load the model
+#         self.pipe = pipeline("visual-question-answering", model="DAMO-NLP-SG/VideoLLaMA2-8x7B")
+#         print("Model downloaded and pipeline created successfully.")
+
+#     def __call__(self, data):
+#         """
+#         Handle inference requests.
+
+#         Args:
+#             data (dict): Input data containing 'image' and 'question'.
+
+#         Returns:
+#             dict: The output from the model.
+#         """
+#         image = data.get("image")
+#         question = data.get("question")
+
+#         if not image or not question:
+#             raise ValueError("Both 'image' and 'question' must be provided in the input data.")
+
+#         # Use the pipeline to perform visual question answering
+#         output = self.pipe(image=image, question=question)
+
+#         return output
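A minimal local smoke test for this handler might look like the sketch below. It assumes the repository root is the working directory, that the sample clip `videollama2/serve/examples/sample_demo_1.mp4` tracked in `.gitattributes` above is present, and that a CUDA GPU with enough memory for the `DAMO-NLP-SG/VideoLLaMA2-7B` weights is available.

```python
# Sketch only: exercises EndpointHandler as defined in handler.py above.
# The request keys mirror the __call__ docstring ('modal', 'modal_path', 'instruct');
# the sample video path is an assumption based on the LFS entries in .gitattributes.
from handler import EndpointHandler

handler = EndpointHandler()  # loads DAMO-NLP-SG/VideoLLaMA2-7B once at startup

request = {
    "modal": "video",                                             # or "image"
    "modal_path": "videollama2/serve/examples/sample_demo_1.mp4",
    "instruct": "Describe what happens in this video.",
}

result = handler(request)    # -> [{"output": "<model response>"}]
print(result[0]["output"])
```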
diff --git a/videollama2/__init__.py b/videollama2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bebf4c4443fd1ac61082908ffd7950d807b6900e
--- /dev/null
+++ b/videollama2/__init__.py
@@ -0,0 +1,117 @@
+import os
+import copy
+import warnings
+import shutil
+from functools import partial
+
+import torch
+import logging
+from .model import load_pretrained_model
+from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria
+from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP
+
+
+def model_init(model_path=None, **kwargs):
+    logging.info(f"Loading Model from {model_path}")
+    model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
+    logging.info(f"Model Path: {model_path}")
+    model_name = get_model_name_from_path(model_path)
+    logging.info(f"Model Name: {model_name}")
+    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
+    logging.info("Model Loaded Successfully")
+    if tokenizer.pad_token is None and tokenizer.unk_token is not None:
+        tokenizer.pad_token = tokenizer.unk_token
+
+    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
+
+    processor = {
+        'image': partial(process_image, processor=processor, aspect_ratio=None),
+        'video': partial(process_video, processor=processor, aspect_ratio=None, num_frames=num_frames),
+    }
+
+    return model, processor, tokenizer
+
+
+def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
+    """Inference API of VideoLLaMA2 for video understanding.
+
+    Args:
+        model: VideoLLaMA2 model.
+        image_or_video (torch.Tensor): image tensor (1, C, H, W) / video tensor (T, C, H, W).
+        instruct (str): text instruction for understanding video.
+        tokenizer: tokenizer.
+        do_sample (bool): whether to sample.
+        modal (str): inference modality.
+    Returns:
+        str: response of the model.
+    """
+
+    # 1. select the modal token for the requested modality.
+    if modal == 'image':
+        modal_token = DEFAULT_IMAGE_TOKEN
+    elif modal == 'video':
+        modal_token = DEFAULT_VIDEO_TOKEN
+    elif modal == 'text':
+        modal_token = ''
+    else:
+        raise ValueError(f"Unsupported modal: {modal}")
+
+    # 2. vision preprocess (load & transform image or video).
+    if modal == 'text':
+        tensor = None
+    else:
+        tensor = image_or_video.half().cuda()
+        tensor = [(tensor, modal)]
+
+    # 3. text preprocess (tag process & generate prompt).
+    if isinstance(instruct, str):
+        message = [{'role': 'user', 'content': modal_token + '\n' + instruct}]
+    elif isinstance(instruct, list):
+        message = copy.deepcopy(instruct)
+        message[0]['content'] = modal_token + '\n' + message[0]['content']
+    else:
+        raise ValueError(f"Unsupported type of instruct: {type(instruct)}")
+
+    if model.config.model_type in ['videollama2', 'videollama2_mistral', 'videollama2_mixtral']:
+        system_message = [
+            {'role': 'system', 'content': (
+            """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. """
+            """Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
+            """\n"""
+            """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
+            }
+        ]
+    else:
+        system_message = []
+
+    message = system_message + message
+    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
+
+    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
+    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+
+    # 4. generate response according to visual signals and prompts.
+    keywords = [tokenizer.eos_token]
+    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+    do_sample = kwargs.get('do_sample', False)
+    temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
+    top_p = kwargs.get('top_p', 0.9)
+    max_new_tokens = kwargs.get('max_new_tokens', 2048)
+
+    with torch.inference_mode():
+        output_ids = model.generate(
+            input_ids,
+            attention_mask=attention_masks,
+            images=tensor,
+            do_sample=do_sample,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+    return outputs
diff --git a/videollama2/constants.py b/videollama2/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba87b61becb0819594962652fd3e193a9c8c3a3f
--- /dev/null
+++ b/videollama2/constants.py
@@ -0,0 +1,32 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
+IGNORE_INDEX = -100
+
+# Image arguments
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+
+# Video arguments
+VIDEO_TOKEN_INDEX = -201
+DEFAULT_VIDEO_TOKEN = "